Coverage Report

Created: 2025-06-13 07:02

/src/tesseract/src/textord/makerow.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 * File:        makerow.cpp  (Formerly makerows.c)
3
 * Description: Code to arrange blobs into rows of text.
4
 * Author:      Ray Smith
5
 *
6
 * (C) Copyright 1992, Hewlett-Packard Ltd.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
// Include automatically generated configuration file if running autoconf.
20
#ifdef HAVE_CONFIG_H
21
#  include "config_auto.h"
22
#endif
23
24
#include "makerow.h"
25
26
#include "blkocc.h"
27
#include "blobbox.h"
28
#include "ccstruct.h"
29
#include "detlinefit.h"
30
#include "drawtord.h"
31
#include "oldbasel.h"
32
#include "sortflts.h"
33
#include "statistc.h"
34
#include "textord.h"
35
#include "tordmain.h"
36
#include "tovars.h"
37
#include "tprintf.h"
38
#include "underlin.h"
39
40
#include <algorithm>
41
#include <cmath>
42
#include <vector> // for std::vector
43
44
namespace tesseract {
45
46
BOOL_VAR(textord_heavy_nr, false, "Vigorously remove noise");
47
BOOL_VAR(textord_show_initial_rows, false, "Display row accumulation");
48
BOOL_VAR(textord_show_parallel_rows, false, "Display page correlated rows");
49
BOOL_VAR(textord_show_expanded_rows, false, "Display rows after expanding");
50
BOOL_VAR(textord_show_final_rows, false, "Display rows after final fitting");
51
BOOL_VAR(textord_show_final_blobs, false, "Display blob bounds after pre-ass");
52
BOOL_VAR(textord_test_landscape, false, "Tests refer to land/port");
53
BOOL_VAR(textord_parallel_baselines, true, "Force parallel baselines");
54
BOOL_VAR(textord_straight_baselines, false, "Force straight baselines");
55
BOOL_VAR(textord_old_baselines, true, "Use old baseline algorithm");
56
BOOL_VAR(textord_old_xheight, false, "Use old xheight algorithm");
57
BOOL_VAR(textord_fix_xheight_bug, true, "Use spline baseline");
58
BOOL_VAR(textord_fix_makerow_bug, true, "Prevent multiple baselines");
59
BOOL_VAR(textord_debug_xheights, false, "Test xheight algorithms");
60
static BOOL_VAR(textord_biased_skewcalc, true, "Bias skew estimates with line length");
61
static BOOL_VAR(textord_interpolating_skew, true, "Interpolate across gaps");
62
static INT_VAR(textord_skewsmooth_offset, 4, "For smooth factor");
63
static INT_VAR(textord_skewsmooth_offset2, 1, "For smooth factor");
64
INT_VAR(textord_test_x, -INT32_MAX, "coord of test pt");
65
INT_VAR(textord_test_y, -INT32_MAX, "coord of test pt");
66
INT_VAR(textord_min_blobs_in_row, 4, "Min blobs before gradient counted");
67
INT_VAR(textord_spline_minblobs, 8, "Min blobs in each spline segment");
68
INT_VAR(textord_spline_medianwin, 6, "Size of window for spline segmentation");
69
static INT_VAR(textord_max_blob_overlaps, 4, "Max number of blobs a big blob can overlap");
70
INT_VAR(textord_min_xheight, 10, "Min credible pixel xheight");
71
double_VAR(textord_spline_shift_fraction, 0.02, "Fraction of line spacing for quad");
72
double_VAR(textord_skew_ile, 0.5, "Ile of gradients for page skew");
73
double_VAR(textord_skew_lag, 0.02, "Lag for skew on row accumulation");
74
double_VAR(textord_linespace_iqrlimit, 0.2, "Max iqr/median for linespace");
75
double_VAR(textord_width_limit, 8, "Max width of blobs to make rows");
76
double_VAR(textord_chop_width, 1.5, "Max width before chopping");
77
static double_VAR(textord_expansion_factor, 1.0, "Factor to expand rows by in expand_rows");
78
static double_VAR(textord_overlap_x, 0.375, "Fraction of linespace for good overlap");
79
double_VAR(textord_minxh, 0.25, "fraction of linesize for min xheight");
80
double_VAR(textord_min_linesize, 1.25, "* blob height for initial linesize");
81
double_VAR(textord_excess_blobsize, 1.3, "New row made if blob makes row this big");
82
double_VAR(textord_occupancy_threshold, 0.4, "Fraction of neighbourhood");
83
double_VAR(textord_underline_width, 2.0, "Multiple of line_size for underline");
84
double_VAR(textord_min_blob_height_fraction, 0.75,
85
           "Min blob height/top to include blob top into xheight stats");
86
double_VAR(textord_xheight_mode_fraction, 0.4, "Min pile height to make xheight");
87
double_VAR(textord_ascheight_mode_fraction, 0.08, "Min pile height to make ascheight");
88
static double_VAR(textord_descheight_mode_fraction, 0.08, "Min pile height to make descheight");
89
double_VAR(textord_ascx_ratio_min, 1.25, "Min cap/xheight");
90
double_VAR(textord_ascx_ratio_max, 1.8, "Max cap/xheight");
91
double_VAR(textord_descx_ratio_min, 0.25, "Min desc/xheight");
92
double_VAR(textord_descx_ratio_max, 0.6, "Max desc/xheight");
93
double_VAR(textord_xheight_error_margin, 0.1, "Accepted variation");
94
INT_VAR(textord_lms_line_trials, 12, "Number of linew fits to do");
95
BOOL_VAR(textord_new_initial_xheight, true, "Use test xheight mechanism");
96
BOOL_VAR(textord_debug_blob, false, "Print test blob information");
97
98
29.0k
#define MAX_HEIGHT_MODES 12
99
100
const int kMinLeaderCount = 5;
101
102
/**
103
 * @name row_y_order
104
 *
105
 * Sort function to sort rows in y from page top.
106
 */
107
static int row_y_order(       // sort function
108
    const TO_ROW *row1, // items to compare
109
97.9k
    const TO_ROW *row2) {
110
97.9k
  if (row1->parallel_c() > row2->parallel_c()) {
111
5.15k
    return -1;
112
92.8k
  } else if (row1->parallel_c() < row2->parallel_c()) {
113
92.2k
    return 1;
114
92.2k
  } else {
115
542
    return 0;
116
542
  }
117
97.9k
}
118
119
/**
120
 * @name row_spacing_order
121
 *
122
 * Qsort style function to compare 2 TO_ROWS based on their spacing value.
123
 */
124
static int row_spacing_order( // sort function
125
    const TO_ROW *row1, // items to compare
126
170k
    const TO_ROW *row2) {
127
170k
  return row1->spacing < row2->spacing;
128
170k
}
129
130
// Factored-out helper to build a single row from a list of blobs.
131
// Returns the mean blob size.
132
0
static float MakeRowFromBlobs(float line_size, BLOBNBOX_IT *blob_it, TO_ROW_IT *row_it) {
133
0
  blob_it->sort(blob_x_order);
134
0
  blob_it->move_to_first();
135
0
  TO_ROW *row = nullptr;
136
0
  float total_size = 0.0f;
137
0
  int blob_count = 0;
138
  // Add all the blobs to a single TO_ROW.
139
0
  for (; !blob_it->empty(); blob_it->forward()) {
140
0
    BLOBNBOX *blob = blob_it->extract();
141
0
    int top = blob->bounding_box().top();
142
0
    int bottom = blob->bounding_box().bottom();
143
0
    if (row == nullptr) {
144
0
      row = new TO_ROW(blob, top, bottom, line_size);
145
0
      row_it->add_before_then_move(row);
146
0
    } else {
147
0
      row->add_blob(blob, top, bottom, line_size);
148
0
    }
149
0
    total_size += top - bottom;
150
0
    ++blob_count;
151
0
  }
152
0
  return blob_count > 0 ? total_size / blob_count : total_size;
153
0
}
154
155
// Helper to make a row using the children of a single blob.
156
// Returns the mean size of the blobs created.
157
0
static float MakeRowFromSubBlobs(TO_BLOCK *block, C_BLOB *blob, TO_ROW_IT *row_it) {
158
  // The blobs made from the children will go in the small_blobs list.
159
0
  BLOBNBOX_IT bb_it(&block->small_blobs);
160
0
  C_OUTLINE_IT ol_it(blob->out_list());
161
  // Get the children.
162
0
  ol_it.set_to_list(ol_it.data()->child());
163
0
  if (ol_it.empty()) {
164
0
    return 0.0f;
165
0
  }
166
0
  for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
167
    // Deep copy the child outline and use that to make a blob.
168
0
    blob = new C_BLOB(C_OUTLINE::deep_copy(ol_it.data()));
169
    // Correct direction as needed.
170
0
    blob->CheckInverseFlagAndDirection();
171
0
    auto *bbox = new BLOBNBOX(blob);
172
0
    bb_it.add_after_then_move(bbox);
173
0
  }
174
  // Now we can make a row from the blobs.
175
0
  return MakeRowFromBlobs(block->line_size, &bb_it, row_it);
176
0
}
177
178
/**
179
 * @name make_single_row
180
 *
181
 * Arrange the blobs into a single row... well actually, if there is
182
 * only a single blob, it makes 2 rows, in case the top-level blob
183
 * is a container of the real blobs to recognize.
184
 */
185
float make_single_row(ICOORD page_tr, bool allow_sub_blobs, TO_BLOCK *block,
186
0
                      TO_BLOCK_LIST *blocks) {
187
0
  BLOBNBOX_IT blob_it = &block->blobs;
188
0
  TO_ROW_IT row_it = block->get_rows();
189
190
  // Include all the small blobs and large blobs.
191
0
  blob_it.add_list_after(&block->small_blobs);
192
0
  blob_it.add_list_after(&block->noise_blobs);
193
0
  blob_it.add_list_after(&block->large_blobs);
194
0
  if (block->blobs.singleton() && allow_sub_blobs) {
195
0
    blob_it.move_to_first();
196
0
    float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it);
197
0
    if (size > block->line_size) {
198
0
      block->line_size = size;
199
0
    }
200
0
  } else if (block->blobs.empty()) {
201
    // Make a fake blob.
202
0
    C_BLOB *blob = C_BLOB::FakeBlob(block->block->pdblk.bounding_box());
203
    // The blobnbox owns the blob.
204
0
    auto *bblob = new BLOBNBOX(blob);
205
0
    blob_it.add_after_then_move(bblob);
206
0
  }
207
0
  MakeRowFromBlobs(block->line_size, &blob_it, &row_it);
208
  // Fit an LMS line to the rows.
209
0
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
210
0
    fit_lms_line(row_it.data());
211
0
  }
212
0
  float gradient;
213
0
  float fit_error;
214
  // Compute the skew based on the fitted line.
215
0
  compute_page_skew(blocks, gradient, fit_error);
216
0
  return gradient;
217
0
}
218
219
/**
220
 * @name make_rows
221
 *
222
 * Arrange the blobs into rows.
223
 */
224
7.74k
float make_rows(ICOORD page_tr, TO_BLOCK_LIST *port_blocks) {
225
7.74k
  float port_m;         // global skew
226
7.74k
  float port_err;       // global noise
227
7.74k
  TO_BLOCK_IT block_it; // iterator
228
229
7.74k
  block_it.set_to_list(port_blocks);
230
15.4k
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
231
7.74k
    make_initial_textrows(page_tr, block_it.data(), FCOORD(1.0f, 0.0f), !textord_test_landscape);
232
7.74k
  }
233
  // compute globally
234
7.74k
  compute_page_skew(port_blocks, port_m, port_err);
235
7.74k
  block_it.set_to_list(port_blocks);
236
15.4k
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
237
7.74k
    cleanup_rows_making(page_tr, block_it.data(), port_m, FCOORD(1.0f, 0.0f),
238
7.74k
                        block_it.data()->block->pdblk.bounding_box().left(),
239
7.74k
                        !textord_test_landscape);
240
7.74k
  }
241
7.74k
  return port_m; // global skew
242
7.74k
}
243
244
/**
245
 * @name make_initial_textrows
246
 *
247
 * Arrange the good blobs into rows of text.
248
 */
249
void make_initial_textrows( // find lines
250
    ICOORD page_tr,
251
    TO_BLOCK *block, // block to do
252
    FCOORD rotation, // for drawing
253
    bool testing_on  // correct orientation
254
7.74k
) {
255
7.74k
  TO_ROW_IT row_it = block->get_rows();
256
257
#ifndef GRAPHICS_DISABLED
258
  ScrollView::Color colour; // of row
259
260
  if (textord_show_initial_rows && testing_on) {
261
    if (to_win == nullptr) {
262
      create_to_win(page_tr);
263
    }
264
  }
265
#endif
266
  // guess skew
267
7.74k
  assign_blobs_to_rows(block, nullptr, 0, true, true, textord_show_initial_rows && testing_on);
268
7.74k
  row_it.move_to_first();
269
77.6k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
270
69.9k
    fit_lms_line(row_it.data());
271
69.9k
  }
272
#ifndef GRAPHICS_DISABLED
273
  if (textord_show_initial_rows && testing_on) {
274
    colour = ScrollView::RED;
275
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
276
      plot_to_row(row_it.data(), colour, rotation);
277
      colour = static_cast<ScrollView::Color>(colour + 1);
278
      if (colour > ScrollView::MAGENTA) {
279
        colour = ScrollView::RED;
280
      }
281
    }
282
  }
283
#endif
284
7.74k
}
285
286
/**
287
 * @name fit_lms_line
288
 *
289
 * Fit an LMS line to a row.
290
 */
291
69.9k
void fit_lms_line(TO_ROW *row) {
292
69.9k
  float m, c; // fitted line
293
69.9k
  tesseract::DetLineFit lms;
294
69.9k
  BLOBNBOX_IT blob_it = row->blob_list();
295
296
306k
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
297
236k
    const TBOX &box = blob_it.data()->bounding_box();
298
236k
    lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom()));
299
236k
  }
300
69.9k
  double error = lms.Fit(&m, &c);
301
69.9k
  row->set_line(m, c, error);
302
69.9k
}
303
304
/**
305
 * @name compute_page_skew
306
 *
307
 * Compute the skew over a full page by averaging the gradients over
308
 * all the lines. Get the error of the same row.
309
 */
310
void compute_page_skew(    // get average gradient
311
    TO_BLOCK_LIST *blocks, // list of blocks
312
    float &page_m,         // average gradient
313
    float &page_err        // average error
314
7.74k
) {
315
7.74k
  int32_t row_count;             // total rows
316
7.74k
  int32_t blob_count;            // total_blobs
317
7.74k
  int32_t row_err;               // integer error
318
7.74k
  int32_t row_index;             // of total
319
7.74k
  TO_ROW *row;                   // current row
320
7.74k
  TO_BLOCK_IT block_it = blocks; // iterator
321
322
7.74k
  row_count = 0;
323
7.74k
  blob_count = 0;
324
15.4k
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
325
7.74k
    POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block();
326
7.74k
    if (pb != nullptr && !pb->IsText()) {
327
0
      continue; // Pretend non-text blocks don't exist.
328
0
    }
329
7.74k
    row_count += block_it.data()->get_rows()->length();
330
    // count up rows
331
7.74k
    TO_ROW_IT row_it(block_it.data()->get_rows());
332
77.6k
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
333
69.9k
      blob_count += row_it.data()->blob_list()->length();
334
69.9k
    }
335
7.74k
  }
336
7.74k
  if (row_count == 0) {
337
207
    page_m = 0.0f;
338
207
    page_err = 0.0f;
339
207
    return;
340
207
  }
341
  // of rows
342
7.53k
  std::vector<float> gradients(blob_count);
343
  // of rows
344
7.53k
  std::vector<float> errors(blob_count);
345
346
7.53k
  row_index = 0;
347
15.0k
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
348
7.53k
    POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block();
349
7.53k
    if (pb != nullptr && !pb->IsText()) {
350
0
      continue; // Pretend non-text blocks don't exist.
351
0
    }
352
7.53k
    TO_ROW_IT row_it(block_it.data()->get_rows());
353
77.4k
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
354
69.9k
      row = row_it.data();
355
69.9k
      blob_count = row->blob_list()->length();
356
69.9k
      row_err = static_cast<int32_t>(std::ceil(row->line_error()));
357
69.9k
      if (row_err <= 0) {
358
55.2k
        row_err = 1;
359
55.2k
      }
360
69.9k
      if (textord_biased_skewcalc) {
361
69.9k
        blob_count /= row_err;
362
240k
        for (blob_count /= row_err; blob_count > 0; blob_count--) {
363
170k
          gradients[row_index] = row->line_m();
364
170k
          errors[row_index] = row->line_error();
365
170k
          row_index++;
366
170k
        }
367
69.9k
      } else if (blob_count >= textord_min_blobs_in_row) {
368
        // get gradient
369
0
        gradients[row_index] = row->line_m();
370
0
        errors[row_index] = row->line_error();
371
0
        row_index++;
372
0
      }
373
69.9k
    }
374
7.53k
  }
375
7.53k
  if (row_index == 0) {
376
    // desperate
377
150
    for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
378
75
      POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block();
379
75
      if (pb != nullptr && !pb->IsText()) {
380
0
        continue; // Pretend non-text blocks don't exist.
381
0
      }
382
75
      TO_ROW_IT row_it(block_it.data()->get_rows());
383
321
      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
384
246
        row = row_it.data();
385
246
        gradients[row_index] = row->line_m();
386
246
        errors[row_index] = row->line_error();
387
246
        row_index++;
388
246
      }
389
75
    }
390
75
  }
391
7.53k
  row_count = row_index;
392
7.53k
  row_index = static_cast<int32_t>(row_count * textord_skew_ile);
393
7.53k
  gradients.resize(row_count);
394
7.53k
  std::nth_element(gradients.begin(), gradients.begin() + row_index, gradients.end());
395
7.53k
  page_m = gradients[row_index];
396
7.53k
  row_index = static_cast<int32_t>(row_count * textord_skew_ile);
397
7.53k
  errors.resize(row_count);
398
7.53k
  std::nth_element(errors.begin(), errors.begin() + row_index, errors.end());
399
7.53k
  page_err = errors[row_index];
400
7.53k
}
401
402
const double kNoiseSize = 0.5; // Fraction of xheight.
403
const int kMinSize = 8;        // Min pixels to be xheight.
404
405
/**
406
 * Return true if the dot looks like it is part of the i.
407
 * Doesn't work for any other diacritical.
408
 */
409
0
static bool dot_of_i(BLOBNBOX *dot, BLOBNBOX *i, TO_ROW *row) {
410
0
  const TBOX &ibox = i->bounding_box();
411
0
  const TBOX &dotbox = dot->bounding_box();
412
413
  // Must overlap horizontally by enough and be high enough.
414
0
  int overlap = std::min(dotbox.right(), ibox.right()) - std::max(dotbox.left(), ibox.left());
415
0
  if (ibox.height() <= 2 * dotbox.height() ||
416
0
      (overlap * 2 < ibox.width() && overlap < dotbox.width())) {
417
0
    return false;
418
0
  }
419
420
  // If the i is tall and thin then it is good.
421
0
  if (ibox.height() > ibox.width() * 2) {
422
0
    return true; // The i or ! must be tall and thin.
423
0
  }
424
425
  // It might still be tall and thin, but it might be joined to something.
426
  // So search the outline for a piece of large height close to the edges
427
  // of the dot.
428
0
  const double kHeightFraction = 0.6;
429
0
  double target_height = std::min(dotbox.bottom(), ibox.top());
430
0
  target_height -= row->line_m() * dotbox.left() + row->line_c();
431
0
  target_height *= kHeightFraction;
432
0
  int left_min = dotbox.left() - dotbox.width();
433
0
  int middle = (dotbox.left() + dotbox.right()) / 2;
434
0
  int right_max = dotbox.right() + dotbox.width();
435
0
  int left_miny = 0;
436
0
  int left_maxy = 0;
437
0
  int right_miny = 0;
438
0
  int right_maxy = 0;
439
0
  bool found_left = false;
440
0
  bool found_right = false;
441
0
  bool in_left = false;
442
0
  bool in_right = false;
443
0
  C_BLOB *blob = i->cblob();
444
0
  C_OUTLINE_IT o_it = blob->out_list();
445
0
  for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) {
446
0
    C_OUTLINE *outline = o_it.data();
447
0
    int length = outline->pathlength();
448
0
    ICOORD pos = outline->start_pos();
449
0
    for (int step = 0; step < length; pos += outline->step(step++)) {
450
0
      int x = pos.x();
451
0
      int y = pos.y();
452
0
      if (x >= left_min && x < middle && !found_left) {
453
        // We are in the left part so find min and max y.
454
0
        if (in_left) {
455
0
          if (y > left_maxy) {
456
0
            left_maxy = y;
457
0
          }
458
0
          if (y < left_miny) {
459
0
            left_miny = y;
460
0
          }
461
0
        } else {
462
0
          left_maxy = left_miny = y;
463
0
          in_left = true;
464
0
        }
465
0
      } else if (in_left) {
466
        // We just left the left so look for size.
467
0
        if (left_maxy - left_miny > target_height) {
468
0
          if (found_right) {
469
0
            return true;
470
0
          }
471
0
          found_left = true;
472
0
        }
473
0
        in_left = false;
474
0
      }
475
0
      if (x <= right_max && x > middle && !found_right) {
476
        // We are in the right part so find min and max y.
477
0
        if (in_right) {
478
0
          if (y > right_maxy) {
479
0
            right_maxy = y;
480
0
          }
481
0
          if (y < right_miny) {
482
0
            right_miny = y;
483
0
          }
484
0
        } else {
485
0
          right_maxy = right_miny = y;
486
0
          in_right = true;
487
0
        }
488
0
      } else if (in_right) {
489
        // We just left the right so look for size.
490
0
        if (right_maxy - right_miny > target_height) {
491
0
          if (found_left) {
492
0
            return true;
493
0
          }
494
0
          found_right = true;
495
0
        }
496
0
        in_right = false;
497
0
      }
498
0
    }
499
0
  }
500
0
  return false;
501
0
}
502
503
0
void vigorous_noise_removal(TO_BLOCK *block) {
504
0
  TO_ROW_IT row_it = block->get_rows();
505
0
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
506
0
    TO_ROW *row = row_it.data();
507
0
    BLOBNBOX_IT b_it = row->blob_list();
508
    // Estimate the xheight on the row.
509
0
    int max_height = 0;
510
0
    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
511
0
      BLOBNBOX *blob = b_it.data();
512
0
      if (blob->bounding_box().height() > max_height) {
513
0
        max_height = blob->bounding_box().height();
514
0
      }
515
0
    }
516
0
    STATS hstats(0, max_height);
517
0
    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
518
0
      BLOBNBOX *blob = b_it.data();
519
0
      int height = blob->bounding_box().height();
520
0
      if (height >= kMinSize) {
521
0
        hstats.add(blob->bounding_box().height(), 1);
522
0
      }
523
0
    }
524
0
    float xheight = hstats.median();
525
    // Delete small objects.
526
0
    BLOBNBOX *prev = nullptr;
527
0
    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
528
0
      BLOBNBOX *blob = b_it.data();
529
0
      const TBOX &box = blob->bounding_box();
530
0
      if (box.height() < kNoiseSize * xheight) {
531
        // Small so delete unless it looks like an i dot.
532
0
        if (prev != nullptr) {
533
0
          if (dot_of_i(blob, prev, row)) {
534
0
            continue; // Looks OK.
535
0
          }
536
0
        }
537
0
        if (!b_it.at_last()) {
538
0
          BLOBNBOX *next = b_it.data_relative(1);
539
0
          if (dot_of_i(blob, next, row)) {
540
0
            continue; // Looks OK.
541
0
          }
542
0
        }
543
        // It might be noise so get rid of it.
544
0
        delete blob->remove_cblob();
545
0
        delete b_it.extract();
546
0
      } else {
547
0
        prev = blob;
548
0
      }
549
0
    }
550
0
  }
551
0
}
552
553
/**
554
 * cleanup_rows_making
555
 *
556
 * Remove overlapping rows and fit all the blobs to what's left.
557
 */
558
void cleanup_rows_making( // find lines
559
    ICOORD page_tr,       // top right
560
    TO_BLOCK *block,      // block to do
561
    float gradient,       // gradient to fit
562
    FCOORD rotation,      // for drawing
563
    int32_t block_edge,   // edge of block
564
    bool testing_on       // correct orientation
565
7.74k
) {
566
  // iterators
567
7.74k
  BLOBNBOX_IT blob_it = &block->blobs;
568
7.74k
  TO_ROW_IT row_it = block->get_rows();
569
570
#ifndef GRAPHICS_DISABLED
571
  if (textord_show_parallel_rows && testing_on) {
572
    if (to_win == nullptr) {
573
      create_to_win(page_tr);
574
    }
575
  }
576
#endif
577
  // get row coords
578
7.74k
  fit_parallel_rows(block, gradient, rotation, block_edge,
579
7.74k
                    textord_show_parallel_rows && testing_on);
580
7.74k
  delete_non_dropout_rows(block, gradient, rotation, block_edge,
581
7.74k
                          textord_show_parallel_rows && testing_on);
582
7.74k
  expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on);
583
7.74k
  blob_it.set_to_list(&block->blobs);
584
7.74k
  row_it.set_to_list(block->get_rows());
585
39.7k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
586
31.9k
    blob_it.add_list_after(row_it.data()->blob_list());
587
31.9k
  }
588
  // give blobs back
589
7.74k
  assign_blobs_to_rows(block, &gradient, 1, false, false, false);
590
  // now new rows must be genuine
591
7.74k
  blob_it.set_to_list(&block->blobs);
592
7.74k
  blob_it.add_list_after(&block->large_blobs);
593
7.74k
  assign_blobs_to_rows(block, &gradient, 2, true, true, false);
594
  // safe to use big ones now
595
7.74k
  blob_it.set_to_list(&block->blobs);
596
  // throw all blobs in
597
7.74k
  blob_it.add_list_after(&block->noise_blobs);
598
7.74k
  blob_it.add_list_after(&block->small_blobs);
599
7.74k
  assign_blobs_to_rows(block, &gradient, 3, false, false, false);
600
7.74k
}
601
602
/**
603
 * delete_non_dropout_rows
604
 *
605
 * Compute the linespacing and offset.
606
 */
607
void delete_non_dropout_rows( // find lines
608
    TO_BLOCK *block,          // block to do
609
    float gradient,           // global skew
610
    FCOORD rotation,          // deskew vector
611
    int32_t block_edge,       // left edge
612
    bool testing_on           // correct orientation
613
7.74k
) {
614
7.74k
  TBOX block_box; // deskewed block
615
7.74k
  int32_t max_y;  // in block
616
7.74k
  int32_t min_y;
617
7.74k
  int32_t line_index; // of scan line
618
7.74k
  int32_t line_count; // no of scan lines
619
7.74k
  int32_t distance;   // to drop-out
620
7.74k
  int32_t xleft;      // of block
621
7.74k
  int32_t ybottom;    // of block
622
7.74k
  TO_ROW *row;        // current row
623
7.74k
  TO_ROW_IT row_it = block->get_rows();
624
7.74k
  BLOBNBOX_IT blob_it = &block->blobs;
625
626
7.74k
  if (row_it.empty()) {
627
207
    return; // empty block
628
207
  }
629
7.53k
  block_box = deskew_block_coords(block, gradient);
630
7.53k
  xleft = block->block->pdblk.bounding_box().left();
631
7.53k
  ybottom = block->block->pdblk.bounding_box().bottom();
632
7.53k
  min_y = block_box.bottom() - 1;
633
7.53k
  max_y = block_box.top() + 1;
634
77.4k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
635
69.9k
    line_index = static_cast<int32_t>(std::floor(row_it.data()->intercept()));
636
69.9k
    if (line_index <= min_y) {
637
667
      min_y = line_index - 1;
638
667
    }
639
69.9k
    if (line_index >= max_y) {
640
0
      max_y = line_index + 1;
641
0
    }
642
69.9k
  }
643
7.53k
  line_count = max_y - min_y + 1;
644
7.53k
  if (line_count <= 0) {
645
0
    return; // empty block
646
0
  }
647
  // change in occupation
648
7.53k
  std::vector<int32_t> deltas(line_count);
649
  // of pixel coords
650
7.53k
  std::vector<int32_t> occupation(line_count);
651
652
7.53k
  compute_line_occupation(block, gradient, min_y, max_y, &occupation[0], &deltas[0]);
653
7.53k
  compute_occupation_threshold(
654
7.53k
      static_cast<int32_t>(ceil(block->line_spacing * (tesseract::CCStruct::kDescenderFraction +
655
7.53k
                                                       tesseract::CCStruct::kAscenderFraction))),
656
7.53k
      static_cast<int32_t>(ceil(block->line_spacing * (tesseract::CCStruct::kXHeightFraction +
657
7.53k
                                                       tesseract::CCStruct::kAscenderFraction))),
658
7.53k
      max_y - min_y + 1, &occupation[0], &deltas[0]);
659
#ifndef GRAPHICS_DISABLED
660
  if (testing_on) {
661
    draw_occupation(xleft, ybottom, min_y, max_y, &occupation[0], &deltas[0]);
662
  }
663
#endif
664
7.53k
  compute_dropout_distances(&occupation[0], &deltas[0], line_count);
665
77.4k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
666
69.9k
    row = row_it.data();
667
69.9k
    line_index = static_cast<int32_t>(std::floor(row->intercept()));
668
69.9k
    distance = deltas[line_index - min_y];
669
69.9k
    if (find_best_dropout_row(row, distance, block->line_spacing / 2, line_index, &row_it,
670
69.9k
                              testing_on)) {
671
#ifndef GRAPHICS_DISABLED
672
      if (testing_on) {
673
        plot_parallel_row(row, gradient, block_edge, ScrollView::WHITE, rotation);
674
      }
675
#endif
676
37.0k
      blob_it.add_list_after(row_it.data()->blob_list());
677
37.0k
      delete row_it.extract(); // too far away
678
37.0k
    }
679
69.9k
  }
680
40.3k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
681
32.8k
    blob_it.add_list_after(row_it.data()->blob_list());
682
32.8k
  }
683
7.53k
}
684
685
/**
686
 * @name find_best_dropout_row
687
 *
688
 * Delete this row if it has a neighbour with better dropout characteristics.
689
 * true is returned if the row should be deleted.
690
 */
691
bool find_best_dropout_row( // find neighbours
692
    TO_ROW *row,            // row to test
693
    int32_t distance,       // dropout dist
694
    float dist_limit,       // threshold distance
695
    int32_t line_index,     // index of row
696
    TO_ROW_IT *row_it,      // current position
697
    bool testing_on         // correct orientation
698
69.9k
) {
699
69.9k
  int32_t next_index; // of neighbouring row
700
69.9k
  int32_t row_offset; // from current row
701
69.9k
  int32_t abs_dist;   // absolute distance
702
69.9k
  int8_t row_inc;     // increment to row_index
703
69.9k
  TO_ROW *next_row;   // nextious row
704
705
69.9k
  if (testing_on) {
706
0
    tprintf("Row at %g(%g), dropout dist=%d,", row->intercept(), row->parallel_c(), distance);
707
0
  }
708
69.9k
  if (distance < 0) {
709
34.1k
    row_inc = 1;
710
34.1k
    abs_dist = -distance;
711
35.8k
  } else {
712
35.8k
    row_inc = -1;
713
35.8k
    abs_dist = distance;
714
35.8k
  }
715
69.9k
  if (abs_dist > dist_limit) {
716
24.9k
    if (testing_on) {
717
0
      tprintf(" too far - deleting\n");
718
0
    }
719
24.9k
    return true;
720
24.9k
  }
721
44.9k
  if ((distance < 0 && !row_it->at_last()) || (distance >= 0 && !row_it->at_first())) {
722
39.9k
    row_offset = row_inc;
723
40.5k
    do {
724
40.5k
      next_row = row_it->data_relative(row_offset);
725
40.5k
      next_index = static_cast<int32_t>(std::floor(next_row->intercept()));
726
40.5k
      if ((distance < 0 && next_index < line_index &&
727
40.5k
           next_index > line_index + distance + distance) ||
728
40.5k
          (distance >= 0 && next_index > line_index &&
729
32.7k
           next_index < line_index + distance + distance)) {
730
11.3k
        if (testing_on) {
731
0
          tprintf(" nearer neighbour (%d) at %g\n", line_index + distance - next_index,
732
0
                  next_row->intercept());
733
0
        }
734
11.3k
        return true; // other is nearer
735
29.2k
      } else if (next_index == line_index || next_index == line_index + distance + distance) {
736
1.37k
        if (row->believability() <= next_row->believability()) {
737
749
          if (testing_on) {
738
0
            tprintf(" equal but more believable at %g (%g/%g)\n", next_row->intercept(),
739
0
                    row->believability(), next_row->believability());
740
0
          }
741
749
          return true; // other is more believable
742
749
        }
743
1.37k
      }
744
28.4k
      row_offset += row_inc;
745
28.4k
    } while ((next_index == line_index || next_index == line_index + distance + distance) &&
746
28.4k
             row_offset < row_it->length());
747
27.8k
    if (testing_on) {
748
0
      tprintf(" keeping\n");
749
0
    }
750
27.8k
  }
751
32.8k
  return false;
752
44.9k
}
753
754
/**
755
 * @name deskew_block_coords
756
 *
757
 * Compute the bounding box of all the blobs in the block
758
 * if they were deskewed without actually doing it.
759
 */
760
TBOX deskew_block_coords( // block box
761
    TO_BLOCK *block,      // block to do
762
    float gradient        // global skew
763
7.53k
) {
764
7.53k
  TBOX result;     // block bounds
765
7.53k
  TBOX blob_box;   // of block
766
7.53k
  FCOORD rotation; // deskew vector
767
7.53k
  float length;    // of gradient vector
768
7.53k
  TO_ROW_IT row_it = block->get_rows();
769
7.53k
  TO_ROW *row;         // current row
770
7.53k
  BLOBNBOX *blob;      // current blob
771
7.53k
  BLOBNBOX_IT blob_it; // iterator
772
773
7.53k
  length = std::sqrt(gradient * gradient + 1);
774
7.53k
  rotation = FCOORD(1 / length, -gradient / length);
775
77.4k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
776
69.9k
    row = row_it.data();
777
69.9k
    blob_it.set_to_list(row->blob_list());
778
306k
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
779
236k
      blob = blob_it.data();
780
236k
      blob_box = blob->bounding_box();
781
236k
      blob_box.rotate(rotation); // de-skew it
782
236k
      result += blob_box;
783
236k
    }
784
69.9k
  }
785
7.53k
  return result;
786
7.53k
}
787
788
/**
789
 * @name compute_line_occupation
790
 *
791
 * Compute the pixel projection back on the y axis given the global
792
 * skew. Also compute the 1st derivative.
793
 */
794
void compute_line_occupation( // project blobs
795
    TO_BLOCK *block,          // block to do
796
    float gradient,           // global skew
797
    int32_t min_y,            // min coord in block
798
    int32_t max_y,            // in block
799
    int32_t *occupation,      // output projection
800
    int32_t *deltas           // derivative
801
7.53k
) {
802
7.53k
  int32_t line_count; // maxy-miny+1
803
7.53k
  int32_t line_index; // of scan line
804
7.53k
  int index;          // array index for daft compilers
805
7.53k
  TO_ROW *row;        // current row
806
7.53k
  TO_ROW_IT row_it = block->get_rows();
807
7.53k
  BLOBNBOX *blob;      // current blob
808
7.53k
  BLOBNBOX_IT blob_it; // iterator
809
7.53k
  float length;        // of skew vector
810
7.53k
  TBOX blob_box;       // bounding box
811
7.53k
  FCOORD rotation;     // inverse of skew
812
813
7.53k
  line_count = max_y - min_y + 1;
814
7.53k
  length = std::sqrt(gradient * gradient + 1);
815
7.53k
  rotation = FCOORD(1 / length, -gradient / length);
816
661k
  for (line_index = 0; line_index < line_count; line_index++) {
817
653k
    deltas[line_index] = 0;
818
653k
  }
819
77.4k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
820
69.9k
    row = row_it.data();
821
69.9k
    blob_it.set_to_list(row->blob_list());
822
306k
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
823
236k
      blob = blob_it.data();
824
236k
      blob_box = blob->bounding_box();
825
236k
      blob_box.rotate(rotation); // de-skew it
826
236k
      int32_t width = blob_box.right() - blob_box.left();
827
236k
      index = blob_box.bottom() - min_y;
828
236k
      ASSERT_HOST(index >= 0 && index < line_count);
829
      // count transitions
830
236k
      deltas[index] += width;
831
236k
      index = blob_box.top() - min_y;
832
236k
      ASSERT_HOST(index >= 0 && index < line_count);
833
236k
      deltas[index] -= width;
834
236k
    }
835
69.9k
  }
836
7.53k
  occupation[0] = deltas[0];
837
653k
  for (line_index = 1; line_index < line_count; line_index++) {
838
646k
    occupation[line_index] = occupation[line_index - 1] + deltas[line_index];
839
646k
  }
840
7.53k
}
841
842
/**
843
 * compute_occupation_threshold
844
 *
845
 * Compute thresholds for textline or not for the occupation array.
846
 */
847
void compute_occupation_threshold( // project blobs
848
    int32_t low_window,            // below result point
849
    int32_t high_window,           // above result point
850
    int32_t line_count,            // array sizes
851
    int32_t *occupation,           // input projection
852
    int32_t *thresholds            // output thresholds
853
7.53k
) {
854
7.53k
  int32_t line_index; // of thresholds line
855
7.53k
  int32_t low_index;  // in occupation
856
7.53k
  int32_t high_index; // in occupation
857
7.53k
  int32_t sum;        // current average
858
7.53k
  int32_t divisor;    // to get thresholds
859
7.53k
  int32_t min_index;  // of min occ
860
7.53k
  int32_t min_occ;    // min in locality
861
7.53k
  int32_t test_index; // for finding min
862
863
7.53k
  divisor = static_cast<int32_t>(ceil((low_window + high_window) / textord_occupancy_threshold));
864
7.53k
  if (low_window + high_window < line_count) {
865
92.8k
    for (sum = 0, high_index = 0; high_index < low_window; high_index++) {
866
86.7k
      sum += occupation[high_index];
867
86.7k
    }
868
135k
    for (low_index = 0; low_index < high_window; low_index++, high_index++) {
869
129k
      sum += occupation[high_index];
870
129k
    }
871
6.08k
    min_occ = occupation[0];
872
6.08k
    min_index = 0;
873
216k
    for (test_index = 1; test_index < high_index; test_index++) {
874
210k
      if (occupation[test_index] <= min_occ) {
875
41.8k
        min_occ = occupation[test_index];
876
41.8k
        min_index = test_index; // find min in region
877
41.8k
      }
878
210k
    }
879
92.8k
    for (line_index = 0; line_index < low_window; line_index++) {
880
86.7k
      thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
881
86.7k
    }
882
    // same out to end
883
365k
    for (low_index = 0; high_index < line_count; low_index++, high_index++) {
884
359k
      sum -= occupation[low_index];
885
359k
      sum += occupation[high_index];
886
359k
      if (occupation[high_index] <= min_occ) {
887
        // find min in region
888
85.9k
        min_occ = occupation[high_index];
889
85.9k
        min_index = high_index;
890
85.9k
      }
891
      // lost min from region
892
359k
      if (min_index <= low_index) {
893
6.08k
        min_occ = occupation[low_index + 1];
894
6.08k
        min_index = low_index + 1;
895
191k
        for (test_index = low_index + 2; test_index <= high_index; test_index++) {
896
185k
          if (occupation[test_index] <= min_occ) {
897
30.0k
            min_occ = occupation[test_index];
898
            // find min in region
899
30.0k
            min_index = test_index;
900
30.0k
          }
901
185k
        }
902
6.08k
      }
903
359k
      thresholds[line_index++] = (sum - min_occ) / divisor + min_occ;
904
359k
    }
905
6.08k
  } else {
906
1.44k
    min_occ = occupation[0];
907
1.44k
    min_index = 0;
908
79.0k
    for (sum = 0, low_index = 0; low_index < line_count; low_index++) {
909
77.6k
      if (occupation[low_index] < min_occ) {
910
0
        min_occ = occupation[low_index];
911
0
        min_index = low_index;
912
0
      }
913
77.6k
      sum += occupation[low_index];
914
77.6k
    }
915
1.44k
    line_index = 0;
916
1.44k
  }
917
214k
  for (; line_index < line_count; line_index++) {
918
206k
    thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
919
206k
  }
920
  // same out to end
921
7.53k
}
922
923
/**
924
 * @name compute_dropout_distances
925
 *
926
 * Compute the distance from each coordinate to the nearest dropout.
927
 */
928
void compute_dropout_distances( // project blobs
929
    int32_t *occupation,        // input projection
930
    int32_t *thresholds,        // output thresholds
931
    int32_t line_count          // array sizes
932
7.53k
) {
933
7.53k
  int32_t line_index;     // of thresholds line
934
7.53k
  int32_t distance;       // from prev dropout
935
7.53k
  int32_t next_dist;      // to next dropout
936
7.53k
  int32_t back_index;     // for back filling
937
7.53k
  int32_t prev_threshold; // before overwrite
938
939
7.53k
  distance = -line_count;
940
7.53k
  line_index = 0;
941
41.2k
  do {
942
653k
    do {
943
653k
      distance--;
944
653k
      prev_threshold = thresholds[line_index];
945
      // distance from prev
946
653k
      thresholds[line_index] = distance;
947
653k
      line_index++;
948
653k
    } while (line_index < line_count && (occupation[line_index] < thresholds[line_index] ||
949
646k
                                         occupation[line_index - 1] >= prev_threshold));
950
41.2k
    if (line_index < line_count) {
951
33.6k
      back_index = line_index - 1;
952
33.6k
      next_dist = 1;
953
180k
      while (next_dist < -distance && back_index >= 0) {
954
146k
        thresholds[back_index] = next_dist;
955
146k
        back_index--;
956
146k
        next_dist++;
957
146k
        distance++;
958
146k
      }
959
33.6k
      distance = 1;
960
33.6k
    }
961
41.2k
  } while (line_index < line_count);
962
7.53k
}
963
964
/**
965
 * @name expand_rows
966
 *
967
 * Expand each row to the least of its allowed size and touching its
968
 * neighbours. If the expansion would entirely swallow a neighbouring row
969
 * then do so.
970
 */
971
void expand_rows(       // find lines
972
    ICOORD page_tr,     // top right
973
    TO_BLOCK *block,    // block to do
974
    float gradient,     // gradient to fit
975
    FCOORD rotation,    // for drawing
976
    int32_t block_edge, // edge of block
977
    bool testing_on     // correct orientation
978
7.74k
) {
979
7.74k
  bool swallowed_row;    // eaten a neighbour
980
7.74k
  float y_max, y_min;    // new row limits
981
7.74k
  float y_bottom, y_top; // allowed limits
982
7.74k
  TO_ROW *test_row;      // next row
983
7.74k
  TO_ROW *row;           // current row
984
                         // iterators
985
7.74k
  BLOBNBOX_IT blob_it = &block->blobs;
986
7.74k
  TO_ROW_IT row_it = block->get_rows();
987
988
#ifndef GRAPHICS_DISABLED
989
  if (textord_show_expanded_rows && testing_on) {
990
    if (to_win == nullptr) {
991
      create_to_win(page_tr);
992
    }
993
  }
994
#endif
995
996
7.74k
  adjust_row_limits(block); // shift min,max.
997
7.74k
  if (textord_new_initial_xheight) {
998
7.74k
    if (block->get_rows()->empty()) {
999
2.60k
      return;
1000
2.60k
    }
1001
5.13k
    compute_row_stats(block, textord_show_expanded_rows && testing_on);
1002
5.13k
  }
1003
5.13k
  assign_blobs_to_rows(block, &gradient, 4, true, false, false);
1004
  // get real membership
1005
5.13k
  if (block->get_rows()->empty()) {
1006
15
    return;
1007
15
  }
1008
5.12k
  fit_parallel_rows(block, gradient, rotation, block_edge,
1009
5.12k
                    textord_show_expanded_rows && testing_on);
1010
5.12k
  if (!textord_new_initial_xheight) {
1011
0
    compute_row_stats(block, textord_show_expanded_rows && testing_on);
1012
0
  }
1013
5.12k
  row_it.move_to_last();
1014
31.9k
  do {
1015
31.9k
    row = row_it.data();
1016
31.9k
    y_max = row->max_y(); // get current limits
1017
31.9k
    y_min = row->min_y();
1018
31.9k
    y_bottom = row->intercept() - block->line_size * textord_expansion_factor *
1019
31.9k
                                      tesseract::CCStruct::kDescenderFraction;
1020
31.9k
    y_top = row->intercept() +
1021
31.9k
            block->line_size * textord_expansion_factor *
1022
31.9k
                (tesseract::CCStruct::kXHeightFraction + tesseract::CCStruct::kAscenderFraction);
1023
31.9k
    if (y_min > y_bottom) { // expansion allowed
1024
24.5k
      if (textord_show_expanded_rows && testing_on) {
1025
0
        tprintf("Expanding bottom of row at %f from %f to %f\n", row->intercept(), y_min, y_bottom);
1026
0
      }
1027
      // expandable
1028
24.5k
      swallowed_row = true;
1029
45.0k
      while (swallowed_row && !row_it.at_last()) {
1030
20.5k
        swallowed_row = false;
1031
        // get next one
1032
20.5k
        test_row = row_it.data_relative(1);
1033
        // overlaps space
1034
20.5k
        if (test_row->max_y() > y_bottom) {
1035
17.0k
          if (test_row->min_y() > y_bottom) {
1036
29
            if (textord_show_expanded_rows && testing_on) {
1037
0
              tprintf("Eating row below at %f\n", test_row->intercept());
1038
0
            }
1039
29
            row_it.forward();
1040
#ifndef GRAPHICS_DISABLED
1041
            if (textord_show_expanded_rows && testing_on) {
1042
              plot_parallel_row(test_row, gradient, block_edge, ScrollView::WHITE, rotation);
1043
            }
1044
#endif
1045
29
            blob_it.set_to_list(row->blob_list());
1046
29
            blob_it.add_list_after(test_row->blob_list());
1047
            // swallow complete row
1048
29
            delete row_it.extract();
1049
29
            row_it.backward();
1050
29
            swallowed_row = true;
1051
17.0k
          } else if (test_row->max_y() < y_min) {
1052
            // shorter limit
1053
925
            y_bottom = test_row->max_y();
1054
925
            if (textord_show_expanded_rows && testing_on) {
1055
0
              tprintf("Truncating limit to %f due to touching row at %f\n", y_bottom,
1056
0
                      test_row->intercept());
1057
0
            }
1058
16.1k
          } else {
1059
16.1k
            y_bottom = y_min; // can't expand it
1060
16.1k
            if (textord_show_expanded_rows && testing_on) {
1061
0
              tprintf("Not expanding limit beyond %f due to touching row at %f\n", y_bottom,
1062
0
                      test_row->intercept());
1063
0
            }
1064
16.1k
          }
1065
17.0k
        }
1066
20.5k
      }
1067
24.5k
      y_min = y_bottom; // expand it
1068
24.5k
    }
1069
31.9k
    if (y_max < y_top) { // expansion allowed
1070
21.6k
      if (textord_show_expanded_rows && testing_on) {
1071
0
        tprintf("Expanding top of row at %f from %f to %f\n", row->intercept(), y_max, y_top);
1072
0
      }
1073
21.6k
      swallowed_row = true;
1074
39.3k
      while (swallowed_row && !row_it.at_first()) {
1075
17.7k
        swallowed_row = false;
1076
        // get one above
1077
17.7k
        test_row = row_it.data_relative(-1);
1078
17.7k
        if (test_row->min_y() < y_top) {
1079
13.4k
          if (test_row->max_y() < y_top) {
1080
8
            if (textord_show_expanded_rows && testing_on) {
1081
0
              tprintf("Eating row above at %f\n", test_row->intercept());
1082
0
            }
1083
8
            row_it.backward();
1084
8
            blob_it.set_to_list(row->blob_list());
1085
#ifndef GRAPHICS_DISABLED
1086
            if (textord_show_expanded_rows && testing_on) {
1087
              plot_parallel_row(test_row, gradient, block_edge, ScrollView::WHITE, rotation);
1088
            }
1089
#endif
1090
8
            blob_it.add_list_after(test_row->blob_list());
1091
            // swallow complete row
1092
8
            delete row_it.extract();
1093
8
            row_it.forward();
1094
8
            swallowed_row = true;
1095
13.4k
          } else if (test_row->min_y() < y_max) {
1096
            // shorter limit
1097
12.7k
            y_top = test_row->min_y();
1098
12.7k
            if (textord_show_expanded_rows && testing_on) {
1099
0
              tprintf("Truncating limit to %f due to touching row at %f\n", y_top,
1100
0
                      test_row->intercept());
1101
0
            }
1102
12.7k
          } else {
1103
696
            y_top = y_max; // can't expand it
1104
696
            if (textord_show_expanded_rows && testing_on) {
1105
0
              tprintf("Not expanding limit beyond %f due to touching row at %f\n", y_top,
1106
0
                      test_row->intercept());
1107
0
            }
1108
696
          }
1109
13.4k
        }
1110
17.7k
      }
1111
21.6k
      y_max = y_top;
1112
21.6k
    }
1113
    // new limits
1114
31.9k
    row->set_limits(y_min, y_max);
1115
31.9k
    row_it.backward();
1116
31.9k
  } while (!row_it.at_last());
1117
5.12k
}
1118
1119
/**
1120
 * adjust_row_limits
1121
 *
1122
 * Change the limits of rows to suit the default fractions.
1123
 */
1124
void adjust_row_limits( // tidy limits
1125
    TO_BLOCK *block     // block to do
1126
7.74k
) {
1127
7.74k
  TO_ROW *row; // current row
1128
7.74k
  float size;  // size of row
1129
7.74k
  float ymax;  // top of row
1130
7.74k
  float ymin;  // bottom of row
1131
7.74k
  TO_ROW_IT row_it = block->get_rows();
1132
1133
7.74k
  if (textord_show_expanded_rows) {
1134
0
    tprintf("Adjusting row limits for block(%d,%d)\n", block->block->pdblk.bounding_box().left(),
1135
0
            block->block->pdblk.bounding_box().top());
1136
0
  }
1137
40.5k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1138
32.8k
    row = row_it.data();
1139
32.8k
    size = row->max_y() - row->min_y();
1140
32.8k
    if (textord_show_expanded_rows) {
1141
0
      tprintf("Row at %f has min %f, max %f, size %f\n", row->intercept(), row->min_y(),
1142
0
              row->max_y(), size);
1143
0
    }
1144
32.8k
    size /= tesseract::CCStruct::kXHeightFraction + tesseract::CCStruct::kAscenderFraction +
1145
32.8k
            tesseract::CCStruct::kDescenderFraction;
1146
32.8k
    ymax = size * (tesseract::CCStruct::kXHeightFraction + tesseract::CCStruct::kAscenderFraction);
1147
32.8k
    ymin = -size * tesseract::CCStruct::kDescenderFraction;
1148
32.8k
    row->set_limits(row->intercept() + ymin, row->intercept() + ymax);
1149
32.8k
    row->merged = false;
1150
32.8k
  }
1151
7.74k
}
1152
1153
/**
1154
 * @name compute_row_stats
1155
 *
1156
 * Compute the linespacing and offset.
1157
 */
1158
void compute_row_stats( // find lines
1159
    TO_BLOCK *block,    // block to do
1160
    bool testing_on     // correct orientation
1161
5.13k
) {
1162
5.13k
  int32_t row_index; // of median
1163
5.13k
  TO_ROW *row;       // current row
1164
5.13k
  TO_ROW *prev_row;  // previous row
1165
5.13k
  float iqr;         // inter quartile range
1166
5.13k
  TO_ROW_IT row_it = block->get_rows();
1167
  // number of rows
1168
5.13k
  int16_t rowcount = row_it.length();
1169
  // for choose nth
1170
5.13k
  std::vector<TO_ROW *> rows(rowcount);
1171
5.13k
  rowcount = 0;
1172
5.13k
  prev_row = nullptr;
1173
5.13k
  row_it.move_to_last(); // start at bottom
1174
32.8k
  do {
1175
32.8k
    row = row_it.data();
1176
32.8k
    if (prev_row != nullptr) {
1177
27.7k
      rows[rowcount++] = prev_row;
1178
27.7k
      prev_row->spacing = row->intercept() - prev_row->intercept();
1179
27.7k
      if (prev_row->spacing < 0.1 && prev_row->spacing > -0.1) {
1180
        // Avoid small spacing values which give a small disp_quant_factor_.
1181
        // That can cause large memory allocations with out-of-memory.
1182
189
        prev_row->spacing = 0;
1183
189
      }
1184
27.7k
      if (testing_on) {
1185
0
        tprintf("Row at %g yields spacing of %g\n", row->intercept(), prev_row->spacing);
1186
0
      }
1187
27.7k
    }
1188
32.8k
    prev_row = row;
1189
32.8k
    row_it.backward();
1190
32.8k
  } while (!row_it.at_last());
1191
5.13k
  block->key_row = prev_row;
1192
5.13k
  block->baseline_offset = std::fmod(prev_row->parallel_c(), block->line_spacing);
1193
5.13k
  if (testing_on) {
1194
0
    tprintf("Blob based spacing=(%g,%g), offset=%g", block->line_size, block->line_spacing,
1195
0
            block->baseline_offset);
1196
0
  }
1197
5.13k
  if (rowcount > 0) {
1198
4.01k
    rows.resize(rowcount);
1199
4.01k
    row_index = rowcount * 3 / 4;
1200
4.01k
    std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order);
1201
4.01k
    iqr = rows[row_index]->spacing;
1202
4.01k
    row_index = rowcount / 4;
1203
4.01k
    std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order);
1204
4.01k
    iqr -= rows[row_index]->spacing;
1205
4.01k
    row_index = rowcount / 2;
1206
4.01k
    std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order);
1207
4.01k
    block->key_row = rows[row_index];
1208
4.01k
    if (testing_on) {
1209
0
      tprintf(" row based=%g(%g)", rows[row_index]->spacing, iqr);
1210
0
    }
1211
4.01k
    if (rowcount > 2 && iqr < rows[row_index]->spacing * textord_linespace_iqrlimit) {
1212
1.46k
      if (!textord_new_initial_xheight) {
1213
0
        if (rows[row_index]->spacing < block->line_spacing &&
1214
0
            rows[row_index]->spacing > block->line_size) {
1215
          // within range
1216
0
          block->line_size = rows[row_index]->spacing;
1217
        // spacing=size
1218
0
        } else if (rows[row_index]->spacing > block->line_spacing) {
1219
0
          block->line_size = block->line_spacing;
1220
0
        }
1221
        // too big so use max
1222
1.46k
      } else {
1223
1.46k
        if (rows[row_index]->spacing < block->line_spacing) {
1224
1.46k
          block->line_size = rows[row_index]->spacing;
1225
1.46k
        } else {
1226
2
          block->line_size = block->line_spacing;
1227
2
        }
1228
        // too big so use max
1229
1.46k
      }
1230
1.46k
      if (block->line_size < textord_min_xheight) {
1231
1.26k
        block->line_size = (float)textord_min_xheight;
1232
1.26k
      }
1233
1.46k
      block->line_spacing = rows[row_index]->spacing;
1234
1.46k
      block->max_blob_size = block->line_spacing * textord_excess_blobsize;
1235
1.46k
    }
1236
4.01k
    block->baseline_offset = std::fmod(rows[row_index]->intercept(), block->line_spacing);
1237
4.01k
  }
1238
5.13k
  if (testing_on) {
1239
0
    tprintf("\nEstimate line size=%g, spacing=%g, offset=%g\n", block->line_size,
1240
0
            block->line_spacing, block->baseline_offset);
1241
0
  }
1242
5.13k
}
1243
1244
/**
1245
 * @name compute_block_xheight
1246
 *
1247
 * Compute the xheight of the individual rows, then correlate them
1248
 * and interpret ascenderless lines, correcting xheights.
1249
 *
1250
 * First we compute our best guess of the x-height of each row independently
1251
 * with compute_row_xheight(), which looks for a pair of commonly occurring
1252
 * heights that could be x-height and ascender height. This function also
1253
 * attempts to find descenders of lowercase letters (i.e. not the small
1254
 * descenders that could appear in upper case letters as Q,J).
1255
 *
1256
 * After this computation each row falls into one of the following categories:
1257
 * ROW_ASCENDERS_FOUND: we found xheight and ascender modes, so this must be
1258
 *                      a regular row; we'll use its xheight to compute
1259
 *                      xheight and ascrise estimates for the block
1260
 * ROW_DESCENDERS_FOUND: no ascenders, so we do not have a high confidence in
1261
 *                       the xheight of this row (don't use it for estimating
1262
 *                       block xheight), but this row can't contain all caps
1263
 * ROW_UNKNOWN: a row with no ascenders/descenders, could be all lowercase
1264
 *              (or mostly lowercase for fonts with very few ascenders),
1265
 *              all upper case or small caps
1266
 * ROW_INVALID: no meaningful xheight could be found for this row
1267
 *
1268
 * We then run correct_row_xheight() and use the computed xheight and ascrise
1269
 * averages to correct xheight values of the rows in ROW_DESCENDERS_FOUND,
1270
 * ROW_UNKNOWN and ROW_INVALID categories.
1271
 *
1272
 */
1273
15.2k
void Textord::compute_block_xheight(TO_BLOCK *block, float gradient) {
1274
15.2k
  TO_ROW *row; // current row
1275
15.2k
  float asc_frac_xheight = CCStruct::kAscenderFraction / CCStruct::kXHeightFraction;
1276
15.2k
  float desc_frac_xheight = CCStruct::kDescenderFraction / CCStruct::kXHeightFraction;
1277
15.2k
  int32_t min_height, max_height; // limits on xheight
1278
15.2k
  TO_ROW_IT row_it = block->get_rows();
1279
15.2k
  if (row_it.empty()) {
1280
190
    return; // no rows
1281
190
  }
1282
1283
  // Compute the best guess of xheight of each row individually.
1284
  // Use xheight and ascrise values of the rows where ascenders were found.
1285
15.1k
  get_min_max_xheight(block->line_size, &min_height, &max_height);
1286
15.1k
  STATS row_asc_xheights(min_height, max_height);
1287
15.1k
  STATS row_asc_ascrise(static_cast<int>(min_height * asc_frac_xheight),
1288
15.1k
                        static_cast<int>(max_height * asc_frac_xheight));
1289
15.1k
  int min_desc_height = static_cast<int>(min_height * desc_frac_xheight);
1290
15.1k
  int max_desc_height = static_cast<int>(max_height * desc_frac_xheight);
1291
15.1k
  STATS row_asc_descdrop(min_desc_height, max_desc_height);
1292
15.1k
  STATS row_desc_xheights(min_height, max_height);
1293
15.1k
  STATS row_desc_descdrop(min_desc_height, max_desc_height);
1294
15.1k
  STATS row_cap_xheights(min_height, max_height);
1295
15.1k
  STATS row_cap_floating_xheights(min_height, max_height);
1296
127k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1297
112k
    row = row_it.data();
1298
    // Compute the xheight of this row if it has not been computed before.
1299
112k
    if (row->xheight <= 0) {
1300
30.5k
      compute_row_xheight(row, block->block->classify_rotation(), gradient, block->line_size);
1301
30.5k
    }
1302
112k
    ROW_CATEGORY row_category = get_row_category(row);
1303
112k
    if (row_category == ROW_ASCENDERS_FOUND) {
1304
61.1k
      row_asc_xheights.add(static_cast<int32_t>(row->xheight), row->xheight_evidence);
1305
61.1k
      row_asc_ascrise.add(static_cast<int32_t>(row->ascrise), row->xheight_evidence);
1306
61.1k
      row_asc_descdrop.add(static_cast<int32_t>(-row->descdrop), row->xheight_evidence);
1307
61.1k
    } else if (row_category == ROW_DESCENDERS_FOUND) {
1308
5.68k
      row_desc_xheights.add(static_cast<int32_t>(row->xheight), row->xheight_evidence);
1309
5.68k
      row_desc_descdrop.add(static_cast<int32_t>(-row->descdrop), row->xheight_evidence);
1310
45.7k
    } else if (row_category == ROW_UNKNOWN) {
1311
15.1k
      fill_heights(row, gradient, min_height, max_height, &row_cap_xheights,
1312
15.1k
                   &row_cap_floating_xheights);
1313
15.1k
    }
1314
112k
  }
1315
1316
15.1k
  float xheight = 0.0;
1317
15.1k
  float ascrise = 0.0;
1318
15.1k
  float descdrop = 0.0;
1319
  // Compute our best guess of xheight of this block.
1320
15.1k
  if (row_asc_xheights.get_total() > 0) {
1321
    // Determine xheight from rows where ascenders were found.
1322
8.62k
    xheight = row_asc_xheights.median();
1323
8.62k
    ascrise = row_asc_ascrise.median();
1324
8.62k
    descdrop = -row_asc_descdrop.median();
1325
8.62k
  } else if (row_desc_xheights.get_total() > 0) {
1326
    // Determine xheight from rows where descenders were found.
1327
1.40k
    xheight = row_desc_xheights.median();
1328
1.40k
    descdrop = -row_desc_descdrop.median();
1329
5.07k
  } else if (row_cap_xheights.get_total() > 0) {
1330
    // All the rows in the block were (a/de)scenderless.
1331
    // Try to search for two modes in row_cap_heights that could
1332
    // be the xheight and the capheight (e.g. some of the rows
1333
    // were lowercase, but did not have enough (a/de)scenders.
1334
    // If such two modes cannot be found, this block is most
1335
    // likely all caps (or all small caps, in which case the code
1336
    // still works as intended).
1337
3.26k
    compute_xheight_from_modes(
1338
3.26k
        &row_cap_xheights, &row_cap_floating_xheights,
1339
3.26k
        textord_single_height_mode && block->block->classify_rotation().y() == 0.0, min_height,
1340
3.26k
        max_height, &(xheight), &(ascrise));
1341
3.26k
    if (ascrise == 0) { // assume only caps in the whole block
1342
2.40k
      xheight = row_cap_xheights.median() * CCStruct::kXHeightCapRatio;
1343
2.40k
    }
1344
3.26k
  } else { // default block sizes
1345
1.81k
    xheight = block->line_size * CCStruct::kXHeightFraction;
1346
1.81k
  }
1347
  // Correct xheight, ascrise and descdrop if necessary.
1348
15.1k
  bool corrected_xheight = false;
1349
15.1k
  if (xheight < textord_min_xheight) {
1350
2.75k
    xheight = static_cast<float>(textord_min_xheight);
1351
2.75k
    corrected_xheight = true;
1352
2.75k
  }
1353
15.1k
  if (corrected_xheight || ascrise <= 0) {
1354
5.62k
    ascrise = xheight * asc_frac_xheight;
1355
5.62k
  }
1356
15.1k
  if (corrected_xheight || descdrop >= 0) {
1357
5.07k
    descdrop = -(xheight * desc_frac_xheight);
1358
5.07k
  }
1359
15.1k
  block->xheight = xheight;
1360
1361
15.1k
  if (textord_debug_xheights) {
1362
0
    tprintf("Block average xheight=%.4f, ascrise=%.4f, descdrop=%.4f\n", xheight, ascrise,
1363
0
            descdrop);
1364
0
  }
1365
  // Correct xheight, ascrise, descdrop of rows based on block averages.
1366
127k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1367
112k
    correct_row_xheight(row_it.data(), xheight, ascrise, descdrop);
1368
112k
  }
1369
15.1k
}
1370
1371
/**
1372
 * @name compute_row_xheight
1373
 *
1374
 * Estimate the xheight of this row.
1375
 * Compute the ascender rise and descender drop at the same time.
1376
 * Set xheigh_evidence to the number of blobs with the chosen xheight
1377
 * that appear in this row.
1378
 */
1379
void Textord::compute_row_xheight(TO_ROW *row, // row to do
1380
                                  const FCOORD &rotation,
1381
                                  float gradient, // global skew
1382
93.5k
                                  int block_line_size) {
1383
  // Find blobs representing repeated characters in rows and mark them.
1384
  // This information is used for computing row xheight and at a later
1385
  // stage when words are formed by make_words.
1386
93.5k
  if (!row->rep_chars_marked()) {
1387
56.2k
    mark_repeated_chars(row);
1388
56.2k
  }
1389
1390
93.5k
  int min_height, max_height;
1391
93.5k
  get_min_max_xheight(block_line_size, &min_height, &max_height);
1392
93.5k
  STATS heights(min_height, max_height);
1393
93.5k
  STATS floating_heights(min_height, max_height);
1394
93.5k
  fill_heights(row, gradient, min_height, max_height, &heights, &floating_heights);
1395
93.5k
  row->ascrise = 0.0f;
1396
93.5k
  row->xheight = 0.0f;
1397
93.5k
  row->xheight_evidence = compute_xheight_from_modes(
1398
93.5k
      &heights, &floating_heights, textord_single_height_mode && rotation.y() == 0.0, min_height,
1399
93.5k
      max_height, &(row->xheight), &(row->ascrise));
1400
93.5k
  row->descdrop = 0.0f;
1401
93.5k
  if (row->xheight > 0) {
1402
25.7k
    row->descdrop =
1403
25.7k
        static_cast<float>(compute_row_descdrop(row, gradient, row->xheight_evidence, &heights));
1404
25.7k
  }
1405
93.5k
}
1406
1407
/**
1408
 * @name fill_heights
1409
 *
1410
 * Fill the given heights with heights of the blobs that are legal
1411
 * candidates for estimating xheight.
1412
 */
1413
void fill_heights(TO_ROW *row, float gradient, int min_height, int max_height, STATS *heights,
1414
108k
                  STATS *floating_heights) {
1415
108k
  float xcentre;  // centre of blob
1416
108k
  float top;      // top y coord of blob
1417
108k
  float height;   // height of blob
1418
108k
  BLOBNBOX *blob; // current blob
1419
108k
  int repeated_set;
1420
108k
  BLOBNBOX_IT blob_it = row->blob_list();
1421
108k
  if (blob_it.empty()) {
1422
0
    return; // no blobs in this row
1423
0
  }
1424
108k
  bool has_rep_chars = row->rep_chars_marked() && row->num_repeated_sets() > 0;
1425
920k
  do {
1426
920k
    blob = blob_it.data();
1427
920k
    if (!blob->joined_to_prev()) {
1428
564k
      xcentre = (blob->bounding_box().left() + blob->bounding_box().right()) / 2.0f;
1429
564k
      top = blob->bounding_box().top();
1430
564k
      height = blob->bounding_box().height();
1431
564k
      if (textord_fix_xheight_bug) {
1432
564k
        top -= row->baseline.y(xcentre);
1433
564k
      } else {
1434
0
        top -= gradient * xcentre + row->parallel_c();
1435
0
      }
1436
564k
      if (top >= min_height && top <= max_height) {
1437
118k
        heights->add(static_cast<int32_t>(floor(top + 0.5)), 1);
1438
118k
        if (height / top < textord_min_blob_height_fraction) {
1439
37.7k
          floating_heights->add(static_cast<int32_t>(floor(top + 0.5)), 1);
1440
37.7k
        }
1441
118k
      }
1442
564k
    }
1443
    // Skip repeated chars, since they are likely to skew the height stats.
1444
920k
    if (has_rep_chars && blob->repeated_set() != 0) {
1445
0
      repeated_set = blob->repeated_set();
1446
0
      blob_it.forward();
1447
0
      while (!blob_it.at_first() && blob_it.data()->repeated_set() == repeated_set) {
1448
0
        blob_it.forward();
1449
0
        if (textord_debug_xheights) {
1450
0
          tprintf("Skipping repeated char when computing xheight\n");
1451
0
        }
1452
0
      }
1453
920k
    } else {
1454
920k
      blob_it.forward();
1455
920k
    }
1456
920k
  } while (!blob_it.at_first());
1457
108k
}
1458
1459
/**
1460
 * @name compute_xheight_from_modes
1461
 *
1462
 * Given a STATS object heights, looks for two most frequently occurring
1463
 * heights that look like xheight and xheight + ascrise. If found, sets
1464
 * the values of *xheight and *ascrise accordingly, otherwise sets xheight
1465
 * to any most frequently occurring height and sets *ascrise to 0.
1466
 * Returns the number of times xheight occurred in heights.
1467
 * For each mode that is considered for being an xheight the count of
1468
 * floating blobs (stored in floating_heights) is subtracted from the
1469
 * total count of the blobs of this height. This is done because blobs
1470
 * that sit far above the baseline could represent valid ascenders, but
1471
 * it is highly unlikely that such a character's height will be an xheight
1472
 * (e.g.  -, ', =, ^, `, ", ', etc)
1473
 * If cap_only, then force finding of only the top mode.
1474
 */
1475
int compute_xheight_from_modes(STATS *heights, STATS *floating_heights, bool cap_only,
1476
96.8k
                               int min_height, int max_height, float *xheight, float *ascrise) {
1477
96.8k
  int blob_index = heights->mode();                 // find mode
1478
96.8k
  int blob_count = heights->pile_count(blob_index); // get count of mode
1479
96.8k
  if (textord_debug_xheights) {
1480
0
    tprintf("min_height=%d, max_height=%d, mode=%d, count=%d, total=%d\n", min_height, max_height,
1481
0
            blob_index, blob_count, heights->get_total());
1482
0
    heights->print();
1483
0
    floating_heights->print();
1484
0
  }
1485
96.8k
  if (blob_count == 0) {
1486
67.7k
    return 0;
1487
67.7k
  }
1488
29.0k
  int modes[MAX_HEIGHT_MODES]; // biggest piles
1489
29.0k
  bool in_best_pile = false;
1490
29.0k
  int prev_size = -INT32_MAX;
1491
29.0k
  int best_count = 0;
1492
29.0k
  int mode_count = compute_height_modes(heights, min_height, max_height, modes, MAX_HEIGHT_MODES);
1493
29.0k
  if (cap_only && mode_count > 1) {
1494
0
    mode_count = 1;
1495
0
  }
1496
29.0k
  int x;
1497
29.0k
  if (textord_debug_xheights) {
1498
0
    tprintf("found %d modes: ", mode_count);
1499
0
    for (x = 0; x < mode_count; x++) {
1500
0
      tprintf("%d ", modes[x]);
1501
0
    }
1502
0
    tprintf("\n");
1503
0
  }
1504
1505
57.0k
  for (x = 0; x < mode_count - 1; x++) {
1506
28.0k
    if (modes[x] != prev_size + 1) {
1507
26.3k
      in_best_pile = false; // had empty height
1508
26.3k
    }
1509
28.0k
    int modes_x_count = heights->pile_count(modes[x]) - floating_heights->pile_count(modes[x]);
1510
28.0k
    if ((modes_x_count >= blob_count * textord_xheight_mode_fraction) &&
1511
28.0k
        (in_best_pile || modes_x_count > best_count)) {
1512
37.6k
      for (int asc = x + 1; asc < mode_count; asc++) {
1513
25.7k
        float ratio = static_cast<float>(modes[asc]) / static_cast<float>(modes[x]);
1514
25.7k
        if (textord_ascx_ratio_min < ratio && ratio < textord_ascx_ratio_max &&
1515
25.7k
            (heights->pile_count(modes[asc]) >= blob_count * textord_ascheight_mode_fraction)) {
1516
11.1k
          if (modes_x_count > best_count) {
1517
6.12k
            in_best_pile = true;
1518
6.12k
            best_count = modes_x_count;
1519
6.12k
          }
1520
11.1k
          if (textord_debug_xheights) {
1521
0
            tprintf("X=%d, asc=%d, count=%d, ratio=%g\n", modes[x], modes[asc] - modes[x],
1522
0
                    modes_x_count, ratio);
1523
0
          }
1524
11.1k
          prev_size = modes[x];
1525
11.1k
          *xheight = static_cast<float>(modes[x]);
1526
11.1k
          *ascrise = static_cast<float>(modes[asc] - modes[x]);
1527
11.1k
        }
1528
25.7k
      }
1529
11.9k
    }
1530
28.0k
  }
1531
29.0k
  if (*xheight == 0) { // single mode
1532
    // Remove counts of the "floating" blobs (the one whose height is too
1533
    // small in relation to it's top end of the bounding box) from heights
1534
    // before computing the single-mode xheight.
1535
    // Restore the counts in heights after the mode is found, since
1536
    // floating blobs might be useful for determining potential ascenders
1537
    // in compute_row_descdrop().
1538
23.2k
    if (floating_heights->get_total() > 0) {
1539
126k
      for (x = min_height; x < max_height; ++x) {
1540
117k
        heights->add(x, -(floating_heights->pile_count(x)));
1541
117k
      }
1542
8.31k
      blob_index = heights->mode(); // find the modified mode
1543
126k
      for (x = min_height; x < max_height; ++x) {
1544
117k
        heights->add(x, floating_heights->pile_count(x));
1545
117k
      }
1546
8.31k
    }
1547
23.2k
    *xheight = static_cast<float>(blob_index);
1548
23.2k
    *ascrise = 0.0f;
1549
23.2k
    best_count = heights->pile_count(blob_index);
1550
23.2k
    if (textord_debug_xheights) {
1551
0
      tprintf("Single mode xheight set to %g\n", *xheight);
1552
0
    }
1553
23.2k
  } else if (textord_debug_xheights) {
1554
0
    tprintf("Multi-mode xheight set to %g, asc=%g\n", *xheight, *ascrise);
1555
0
  }
1556
29.0k
  return best_count;
1557
96.8k
}
1558
1559
/**
1560
 * @name compute_row_descdrop
1561
 *
1562
 * Estimates the descdrop of this row. This function looks for
1563
 * "significant" descenders of lowercase letters (those that could
1564
 * not just be the small descenders of upper case letters like Q,J).
1565
 * The function also takes into account how many potential ascenders
1566
 * this row might contain. If the number of potential ascenders along
1567
 * with descenders is close to the expected fraction of the total
1568
 * number of blobs in the row, the function returns the descender
1569
 * height, returns 0 otherwise.
1570
 */
1571
int32_t compute_row_descdrop(TO_ROW *row, float gradient, int xheight_blob_count,
1572
25.7k
                             STATS *asc_heights) {
1573
  // Count how many potential ascenders are in this row.
1574
25.7k
  int i_min = asc_heights->min_bucket();
1575
25.7k
  if ((i_min / row->xheight) < textord_ascx_ratio_min) {
1576
25.1k
    i_min = static_cast<int>(floor(row->xheight * textord_ascx_ratio_min + 0.5));
1577
25.1k
  }
1578
25.7k
  int i_max = asc_heights->max_bucket();
1579
25.7k
  if ((i_max / row->xheight) > textord_ascx_ratio_max) {
1580
1.52k
    i_max = static_cast<int>(floor(row->xheight * textord_ascx_ratio_max));
1581
1.52k
  }
1582
25.7k
  int num_potential_asc = 0;
1583
52.8k
  for (int i = i_min; i <= i_max; ++i) {
1584
27.1k
    num_potential_asc += asc_heights->pile_count(i);
1585
27.1k
  }
1586
25.7k
  auto min_height = static_cast<int32_t>(floor(row->xheight * textord_descx_ratio_min + 0.5));
1587
25.7k
  auto max_height = static_cast<int32_t>(floor(row->xheight * textord_descx_ratio_max));
1588
25.7k
  float xcentre; // centre of blob
1589
25.7k
  float height;  // height of blob
1590
25.7k
  BLOBNBOX_IT blob_it = row->blob_list();
1591
25.7k
  BLOBNBOX *blob; // current blob
1592
25.7k
  STATS heights(min_height, max_height);
1593
382k
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1594
357k
    blob = blob_it.data();
1595
357k
    if (!blob->joined_to_prev()) {
1596
167k
      xcentre = (blob->bounding_box().left() + blob->bounding_box().right()) / 2.0f;
1597
167k
      height = (gradient * xcentre + row->parallel_c() - blob->bounding_box().bottom());
1598
167k
      if (height >= min_height && height <= max_height) {
1599
23.0k
        heights.add(static_cast<int>(floor(height + 0.5)), 1);
1600
23.0k
      }
1601
167k
    }
1602
357k
  }
1603
25.7k
  int blob_index = heights.mode();                 // find mode
1604
25.7k
  int blob_count = heights.pile_count(blob_index); // get count of mode
1605
25.7k
  float total_fraction = (textord_descheight_mode_fraction + textord_ascheight_mode_fraction);
1606
25.7k
  if (static_cast<float>(blob_count + num_potential_asc) < xheight_blob_count * total_fraction) {
1607
12.2k
    blob_count = 0;
1608
12.2k
  }
1609
25.7k
  int descdrop = blob_count > 0 ? -blob_index : 0;
1610
25.7k
  if (textord_debug_xheights) {
1611
0
    tprintf("Descdrop: %d (potential ascenders %d, descenders %d)\n", descdrop, num_potential_asc,
1612
0
            blob_count);
1613
0
    heights.print();
1614
0
  }
1615
25.7k
  return descdrop;
1616
25.7k
}
1617
1618
/**
1619
 * @name compute_height_modes
1620
 *
1621
 * Find the top maxmodes values in the input array and put their
1622
 * indices in the output in the order in which they occurred.
1623
 */
1624
int32_t compute_height_modes(STATS *heights,     // stats to search
1625
                             int32_t min_height, // bottom of range
1626
                             int32_t max_height, // top of range
1627
                             int32_t *modes,     // output array
1628
29.0k
                             int32_t maxmodes) { // size of modes
1629
29.0k
  int32_t pile_count;                            // no in source pile
1630
29.0k
  int32_t src_count;                             // no of source entries
1631
29.0k
  int32_t src_index;                             // current entry
1632
29.0k
  int32_t least_count;                           // height of smalllest
1633
29.0k
  int32_t least_index;                           // index of least
1634
29.0k
  int32_t dest_count;                            // index in modes
1635
1636
29.0k
  src_count = max_height + 1 - min_height;
1637
29.0k
  dest_count = 0;
1638
29.0k
  least_count = INT32_MAX;
1639
29.0k
  least_index = -1;
1640
937k
  for (src_index = 0; src_index < src_count; src_index++) {
1641
908k
    pile_count = heights->pile_count(min_height + src_index);
1642
908k
    if (pile_count > 0) {
1643
57.4k
      if (dest_count < maxmodes) {
1644
57.0k
        if (pile_count < least_count) {
1645
          // find smallest in array
1646
32.2k
          least_count = pile_count;
1647
32.2k
          least_index = dest_count;
1648
32.2k
        }
1649
57.0k
        modes[dest_count++] = min_height + src_index;
1650
57.0k
      } else if (pile_count >= least_count) {
1651
2.07k
        while (least_index < maxmodes - 1) {
1652
1.70k
          modes[least_index] = modes[least_index + 1];
1653
          // shuffle up
1654
1.70k
          least_index++;
1655
1.70k
        }
1656
        // new one on end
1657
373
        modes[maxmodes - 1] = min_height + src_index;
1658
373
        if (pile_count == least_count) {
1659
          // new smallest
1660
236
          least_index = maxmodes - 1;
1661
236
        } else {
1662
137
          least_count = heights->pile_count(modes[0]);
1663
137
          least_index = 0;
1664
1.64k
          for (dest_count = 1; dest_count < maxmodes; dest_count++) {
1665
1.50k
            pile_count = heights->pile_count(modes[dest_count]);
1666
1.50k
            if (pile_count < least_count) {
1667
              // find smallest
1668
127
              least_count = pile_count;
1669
127
              least_index = dest_count;
1670
127
            }
1671
1.50k
          }
1672
137
        }
1673
373
      }
1674
57.4k
    }
1675
908k
  }
1676
29.0k
  return dest_count;
1677
29.0k
}
1678
1679
/**
1680
 * @name correct_row_xheight
1681
 *
1682
 * Adjust the xheight etc of this row if not within reasonable limits
1683
 * of the average for the block.
1684
 */
1685
112k
void correct_row_xheight(TO_ROW *row, float xheight, float ascrise, float descdrop) {
1686
112k
  ROW_CATEGORY row_category = get_row_category(row);
1687
112k
  if (textord_debug_xheights) {
1688
0
    tprintf(
1689
0
        "correcting row xheight: row->xheight %.4f"
1690
0
        ", row->acrise %.4f row->descdrop %.4f\n",
1691
0
        row->xheight, row->ascrise, row->descdrop);
1692
0
  }
1693
112k
  bool normal_xheight = within_error_margin(row->xheight, xheight, textord_xheight_error_margin);
1694
112k
  bool cap_xheight =
1695
112k
      within_error_margin(row->xheight, xheight + ascrise, textord_xheight_error_margin);
1696
  // Use the average xheight/ascrise for the following cases:
1697
  // -- the xheight of the row could not be determined at all
1698
  // -- the row has descenders (e.g. "many groups", "ISBN 12345 p.3")
1699
  //    and its xheight is close to either cap height or average xheight
1700
  // -- the row does not have ascenders or descenders, but its xheight
1701
  //    is close to the average block xheight (e.g. row with "www.mmm.com")
1702
112k
  if (row_category == ROW_ASCENDERS_FOUND) {
1703
61.1k
    if (row->descdrop >= 0) {
1704
2.11k
      row->descdrop = row->xheight * (descdrop / xheight);
1705
2.11k
    }
1706
61.1k
  } else if (row_category == ROW_INVALID ||
1707
51.4k
             (row_category == ROW_DESCENDERS_FOUND && (normal_xheight || cap_xheight)) ||
1708
51.4k
             (row_category == ROW_UNKNOWN && normal_xheight)) {
1709
39.0k
    if (textord_debug_xheights) {
1710
0
      tprintf("using average xheight\n");
1711
0
    }
1712
39.0k
    row->xheight = xheight;
1713
39.0k
    row->ascrise = ascrise;
1714
39.0k
    row->descdrop = descdrop;
1715
39.0k
  } else if (row_category == ROW_DESCENDERS_FOUND) {
1716
    // Assume this is a row with mostly lowercase letters and it's xheight
1717
    // is computed correctly (unfortunately there is no way to distinguish
1718
    // this from the case when descenders are found, but the most common
1719
    // height is capheight).
1720
2.35k
    if (textord_debug_xheights) {
1721
0
      tprintf("lowercase, corrected ascrise\n");
1722
0
    }
1723
2.35k
    row->ascrise = row->xheight * (ascrise / xheight);
1724
10.0k
  } else if (row_category == ROW_UNKNOWN) {
1725
    // Otherwise assume this row is an all-caps or small-caps row
1726
    // and adjust xheight and ascrise of the row.
1727
1728
10.0k
    row->all_caps = true;
1729
10.0k
    if (cap_xheight) { // regular all caps
1730
2.87k
      if (textord_debug_xheights) {
1731
0
        tprintf("all caps\n");
1732
0
      }
1733
2.87k
      row->xheight = xheight;
1734
2.87k
      row->ascrise = ascrise;
1735
2.87k
      row->descdrop = descdrop;
1736
7.16k
    } else { // small caps or caps with an odd xheight
1737
7.16k
      if (textord_debug_xheights) {
1738
0
        if (row->xheight < xheight + ascrise && row->xheight > xheight) {
1739
0
          tprintf("small caps\n");
1740
0
        } else {
1741
0
          tprintf("all caps with irregular xheight\n");
1742
0
        }
1743
0
      }
1744
7.16k
      row->ascrise = row->xheight * (ascrise / (xheight + ascrise));
1745
7.16k
      row->xheight -= row->ascrise;
1746
7.16k
      row->descdrop = row->xheight * (descdrop / xheight);
1747
7.16k
    }
1748
10.0k
  }
1749
112k
  if (textord_debug_xheights) {
1750
0
    tprintf(
1751
0
        "corrected row->xheight = %.4f, row->acrise = %.4f, row->descdrop"
1752
0
        " = %.4f\n",
1753
0
        row->xheight, row->ascrise, row->descdrop);
1754
0
  }
1755
112k
}
1756
1757
7.32k
static int CountOverlaps(const TBOX &box, int min_height, BLOBNBOX_LIST *blobs) {
1758
7.32k
  int overlaps = 0;
1759
7.32k
  BLOBNBOX_IT blob_it(blobs);
1760
85.8k
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1761
78.5k
    BLOBNBOX *blob = blob_it.data();
1762
78.5k
    const TBOX &blob_box = blob->bounding_box();
1763
78.5k
    if (blob_box.height() >= min_height && box.major_overlap(blob_box)) {
1764
8.39k
      ++overlaps;
1765
8.39k
    }
1766
78.5k
  }
1767
7.32k
  return overlaps;
1768
7.32k
}
1769
1770
/**
1771
 * @name separate_underlines
1772
 *
1773
 * Test wide objects for being potential underlines. If they are then
1774
 * put them in a separate list in the block.
1775
 */
1776
void separate_underlines(TO_BLOCK *block,   // block to do
1777
                         float gradient,    // skew angle
1778
                         FCOORD rotation,   // inverse landscape
1779
7.74k
                         bool testing_on) { // correct orientation
1780
7.74k
  BLOBNBOX *blob;                           // current blob
1781
7.74k
  C_BLOB *rotated_blob;                     // rotated blob
1782
7.74k
  TO_ROW *row;                              // current row
1783
7.74k
  float length;                             // of g_vec
1784
7.74k
  TBOX blob_box;
1785
7.74k
  FCOORD blob_rotation; // inverse of rotation
1786
7.74k
  FCOORD g_vec;         // skew rotation
1787
7.74k
  BLOBNBOX_IT blob_it;  // iterator
1788
                        // iterator
1789
7.74k
  BLOBNBOX_IT under_it = &block->underlines;
1790
7.74k
  BLOBNBOX_IT large_it = &block->large_blobs;
1791
7.74k
  TO_ROW_IT row_it = block->get_rows();
1792
7.74k
  int min_blob_height = static_cast<int>(textord_min_blob_height_fraction * block->line_size + 0.5);
1793
1794
  // length of vector
1795
7.74k
  length = std::sqrt(1 + gradient * gradient);
1796
7.74k
  g_vec = FCOORD(1 / length, -gradient / length);
1797
7.74k
  blob_rotation = FCOORD(rotation.x(), -rotation.y());
1798
7.74k
  blob_rotation.rotate(g_vec); // undoing everything
1799
64.0k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1800
56.3k
    row = row_it.data();
1801
    // get blobs
1802
56.3k
    blob_it.set_to_list(row->blob_list());
1803
578k
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1804
522k
      blob = blob_it.data();
1805
522k
      blob_box = blob->bounding_box();
1806
522k
      if (blob_box.width() > block->line_size * textord_underline_width) {
1807
14.4k
        ASSERT_HOST(blob->cblob() != nullptr);
1808
14.4k
        rotated_blob = crotate_cblob(blob->cblob(), blob_rotation);
1809
14.4k
        if (test_underline(testing_on && textord_show_final_rows, rotated_blob,
1810
14.4k
                           static_cast<int16_t>(row->intercept()),
1811
14.4k
                           static_cast<int16_t>(block->line_size *
1812
14.4k
                                                (tesseract::CCStruct::kXHeightFraction +
1813
14.4k
                                                 tesseract::CCStruct::kAscenderFraction / 2.0f)))) {
1814
7.14k
          under_it.add_after_then_move(blob_it.extract());
1815
7.14k
          if (testing_on && textord_show_final_rows) {
1816
0
            tprintf("Underlined blob at:");
1817
0
            rotated_blob->bounding_box().print();
1818
0
            tprintf("Was:");
1819
0
            blob_box.print();
1820
0
          }
1821
7.32k
        } else if (CountOverlaps(blob->bounding_box(), min_blob_height, row->blob_list()) >
1822
7.32k
                   textord_max_blob_overlaps) {
1823
124
          large_it.add_after_then_move(blob_it.extract());
1824
124
          if (testing_on && textord_show_final_rows) {
1825
0
            tprintf("Large blob overlaps %d blobs at:",
1826
0
                    CountOverlaps(blob_box, min_blob_height, row->blob_list()));
1827
0
            blob_box.print();
1828
0
          }
1829
124
        }
1830
14.4k
        delete rotated_blob;
1831
14.4k
      }
1832
522k
    }
1833
56.3k
  }
1834
7.74k
}
1835
1836
/**
1837
 * @name pre_associate_blobs
1838
 *
1839
 * Associate overlapping blobs and fake chop wide blobs.
1840
 */
1841
void pre_associate_blobs( // make rough chars
1842
    ICOORD page_tr,       // top right
1843
    TO_BLOCK *block,      // block to do
1844
    FCOORD rotation,      // inverse landscape
1845
    bool testing_on       // correct orientation
1846
7.74k
) {
1847
#ifndef GRAPHICS_DISABLED
1848
  ScrollView::Color colour; // of boxes
1849
#endif
1850
7.74k
  BLOBNBOX *blob;     // current blob
1851
7.74k
  BLOBNBOX *nextblob; // next in list
1852
7.74k
  TBOX blob_box;
1853
7.74k
  FCOORD blob_rotation; // inverse of rotation
1854
7.74k
  BLOBNBOX_IT blob_it;  // iterator
1855
7.74k
  BLOBNBOX_IT start_it; // iterator
1856
7.74k
  TO_ROW_IT row_it = block->get_rows();
1857
1858
#ifndef GRAPHICS_DISABLED
1859
  colour = ScrollView::RED;
1860
#endif
1861
1862
7.74k
  blob_rotation = FCOORD(rotation.x(), -rotation.y());
1863
64.0k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1864
    // get blobs
1865
56.3k
    blob_it.set_to_list(row_it.data()->blob_list());
1866
368k
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1867
311k
      blob = blob_it.data();
1868
311k
      blob_box = blob->bounding_box();
1869
311k
      start_it = blob_it; // save start point
1870
      //                      if (testing_on && textord_show_final_blobs)
1871
      //                      {
1872
      //                              tprintf("Blob at (%d,%d)->(%d,%d),
1873
      //                              addr=%x, count=%d\n",
1874
      //                                      blob_box.left(),blob_box.bottom(),
1875
      //                                      blob_box.right(),blob_box.top(),
1876
      //                                      (void*)blob,blob_it.length());
1877
      //                      }
1878
311k
      bool overlap;
1879
545k
      do {
1880
545k
        overlap = false;
1881
545k
        if (!blob_it.at_last()) {
1882
484k
          nextblob = blob_it.data_relative(1);
1883
484k
          overlap = blob_box.major_x_overlap(nextblob->bounding_box());
1884
484k
          if (overlap) {
1885
233k
            blob->merge(nextblob);           // merge new blob
1886
233k
            blob_box = blob->bounding_box(); // get bigger box
1887
233k
            blob_it.forward();
1888
233k
          }
1889
484k
        }
1890
545k
      } while (overlap);
1891
311k
      blob->chop(&start_it, &blob_it, blob_rotation,
1892
311k
                 block->line_size * tesseract::CCStruct::kXHeightFraction * textord_chop_width);
1893
      // attempt chop
1894
311k
    }
1895
#ifndef GRAPHICS_DISABLED
1896
    if (testing_on && textord_show_final_blobs) {
1897
      if (to_win == nullptr) {
1898
        create_to_win(page_tr);
1899
      }
1900
      to_win->Pen(colour);
1901
      for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1902
        blob = blob_it.data();
1903
        blob_box = blob->bounding_box();
1904
        blob_box.rotate(rotation);
1905
        if (!blob->joined_to_prev()) {
1906
          to_win->Rectangle(blob_box.left(), blob_box.bottom(), blob_box.right(), blob_box.top());
1907
        }
1908
      }
1909
      colour = static_cast<ScrollView::Color>(colour + 1);
1910
      if (colour > ScrollView::MAGENTA) {
1911
        colour = ScrollView::RED;
1912
      }
1913
    }
1914
#endif
1915
56.3k
  }
1916
7.74k
}
1917
1918
/**
1919
 * @name fit_parallel_rows
1920
 *
1921
 * Re-fit the rows in the block to the given gradient.
1922
 */
1923
void fit_parallel_rows( // find lines
1924
    TO_BLOCK *block,    // block to do
1925
    float gradient,     // gradient to fit
1926
    FCOORD rotation,    // for drawing
1927
    int32_t block_edge, // edge of block
1928
    bool testing_on     // correct orientation
1929
12.8k
) {
1930
#ifndef GRAPHICS_DISABLED
1931
  ScrollView::Color colour; // of row
1932
#endif
1933
12.8k
  TO_ROW_IT row_it = block->get_rows();
1934
1935
12.8k
  row_it.move_to_first();
1936
114k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1937
101k
    if (row_it.data()->blob_list()->empty()) {
1938
0
      delete row_it.extract(); // nothing in it
1939
101k
    } else {
1940
101k
      fit_parallel_lms(gradient, row_it.data());
1941
101k
    }
1942
101k
  }
1943
#ifndef GRAPHICS_DISABLED
1944
  if (testing_on) {
1945
    colour = ScrollView::RED;
1946
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1947
      plot_parallel_row(row_it.data(), gradient, block_edge, colour, rotation);
1948
      colour = static_cast<ScrollView::Color>(colour + 1);
1949
      if (colour > ScrollView::MAGENTA) {
1950
        colour = ScrollView::RED;
1951
      }
1952
    }
1953
  }
1954
#endif
1955
12.8k
  row_it.sort(row_y_order); // may have gone out of order
1956
12.8k
}
1957
1958
/**
1959
 * @name fit_parallel_lms
1960
 *
1961
 * Fit an LMS line to a row.
1962
 * Make the fit parallel to the given gradient and set the
1963
 * row accordingly.
1964
 */
1965
101k
void fit_parallel_lms(float gradient, TO_ROW *row) {
1966
101k
  float c;       // fitted line
1967
101k
  int blobcount; // no of blobs
1968
101k
  tesseract::DetLineFit lms;
1969
101k
  BLOBNBOX_IT blob_it = row->blob_list();
1970
1971
101k
  blobcount = 0;
1972
509k
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1973
407k
    if (!blob_it.data()->joined_to_prev()) {
1974
407k
      const TBOX &box = blob_it.data()->bounding_box();
1975
407k
      lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom()));
1976
407k
      blobcount++;
1977
407k
    }
1978
407k
  }
1979
101k
  double error = lms.ConstrainedFit(gradient, &c);
1980
101k
  row->set_parallel_line(gradient, c, error);
1981
101k
  if (textord_straight_baselines && blobcount > textord_lms_line_trials) {
1982
0
    error = lms.Fit(&gradient, &c);
1983
0
  }
1984
  // set the other too
1985
101k
  row->set_line(gradient, c, error);
1986
101k
}
1987
1988
/**
1989
 * @name make_spline_rows
1990
 *
1991
 * Re-fit the rows in the block to the given gradient.
1992
 */
1993
void Textord::make_spline_rows(TO_BLOCK *block, // block to do
1994
                               float gradient,  // gradient to fit
1995
7.74k
                               bool testing_on) {
1996
#ifndef GRAPHICS_DISABLED
1997
  ScrollView::Color colour; // of row
1998
  if (testing_on && to_win == nullptr) {
1999
    create_to_win(page_tr_);
2000
  }
2001
#endif
2002
7.74k
  TO_ROW_IT row_it = block->get_rows();
2003
2004
7.74k
  row_it.move_to_first();
2005
64.0k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
2006
56.3k
    if (row_it.data()->blob_list()->empty()) {
2007
40
      delete row_it.extract(); // nothing in it
2008
56.2k
    } else {
2009
56.2k
      make_baseline_spline(row_it.data(), block);
2010
56.2k
    }
2011
56.3k
  }
2012
7.74k
  if (textord_old_baselines) {
2013
#ifndef GRAPHICS_DISABLED
2014
    if (testing_on) {
2015
      colour = ScrollView::RED;
2016
      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
2017
        row_it.data()->baseline.plot(to_win, colour);
2018
        colour = static_cast<ScrollView::Color>(colour + 1);
2019
        if (colour > ScrollView::MAGENTA) {
2020
          colour = ScrollView::RED;
2021
        }
2022
      }
2023
    }
2024
#endif
2025
7.74k
    make_old_baselines(block, testing_on, gradient);
2026
7.74k
  }
2027
#ifndef GRAPHICS_DISABLED
2028
  if (testing_on) {
2029
    colour = ScrollView::RED;
2030
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
2031
      row_it.data()->baseline.plot(to_win, colour);
2032
      colour = static_cast<ScrollView::Color>(colour + 1);
2033
      if (colour > ScrollView::MAGENTA) {
2034
        colour = ScrollView::RED;
2035
      }
2036
    }
2037
  }
2038
#endif
2039
7.74k
}
2040
2041
/**
2042
 * @name make_baseline_spline
2043
 *
2044
 * Fit an LMS line to a row.
2045
 * Make the fit parallel to the given gradient and set the
2046
 * row accordingly.
2047
 */
2048
void make_baseline_spline(TO_ROW *row, // row to fit
2049
56.2k
                          TO_BLOCK *block) {
2050
56.2k
  double *coeffs;   // quadratic coeffs
2051
56.2k
  int32_t segments; // no of segments
2052
2053
  // spline boundaries
2054
56.2k
  auto *xstarts = new int32_t[row->blob_list()->length() + 1];
2055
56.2k
  if (segment_baseline(row, block, segments, xstarts) && !textord_straight_baselines &&
2056
56.2k
      !textord_parallel_baselines) {
2057
0
    coeffs = linear_spline_baseline(row, block, segments, xstarts);
2058
56.2k
  } else {
2059
56.2k
    xstarts[1] = xstarts[segments];
2060
56.2k
    segments = 1;
2061
56.2k
    coeffs = new double[3];
2062
56.2k
    coeffs[0] = 0;
2063
56.2k
    coeffs[1] = row->line_m();
2064
56.2k
    coeffs[2] = row->line_c();
2065
56.2k
  }
2066
56.2k
  row->baseline = QSPLINE(segments, xstarts, coeffs);
2067
56.2k
  delete[] coeffs;
2068
56.2k
  delete[] xstarts;
2069
56.2k
}
2070
2071
/**
2072
 * @name segment_baseline
2073
 *
2074
 * Divide the baseline up into segments which require a different
2075
 * quadratic fitted to them.
2076
 * Return true if enough blobs were far enough away to need a quadratic.
2077
 */
2078
bool segment_baseline( // split baseline
2079
    TO_ROW *row,       // row to fit
2080
    TO_BLOCK *block,   // block it came from
2081
    int32_t &segments, // no fo segments
2082
    int32_t *xstarts   // coords of segments
2083
56.2k
) {
2084
56.2k
  bool needs_curve; // needs curved line
2085
56.2k
  int blobcount;    // no of blobs
2086
56.2k
  int blobindex;    // current blob
2087
56.2k
  int last_state;   // above, on , below
2088
56.2k
  int state;        // of current blob
2089
56.2k
  float yshift;     // from baseline
2090
56.2k
  TBOX box;         // blob box
2091
56.2k
  TBOX new_box;     // new_it box
2092
56.2k
  float middle;     // xcentre of blob
2093
                    // blobs
2094
56.2k
  BLOBNBOX_IT blob_it = row->blob_list();
2095
56.2k
  BLOBNBOX_IT new_it = blob_it; // front end
2096
56.2k
  SORTED_FLOATS yshifts;        // shifts from baseline
2097
2098
56.2k
  needs_curve = false;
2099
56.2k
  box = box_next_pre_chopped(&blob_it);
2100
56.2k
  xstarts[0] = box.left();
2101
56.2k
  segments = 1;
2102
56.2k
  blobcount = row->blob_list()->length();
2103
56.2k
  if (textord_oldbl_debug) {
2104
0
    tprintf("Segmenting baseline of %d blobs at (%d,%d)\n", blobcount, box.left(), box.bottom());
2105
0
  }
2106
56.2k
  if (blobcount <= textord_spline_medianwin || blobcount < textord_spline_minblobs) {
2107
36.6k
    blob_it.move_to_last();
2108
36.6k
    box = blob_it.data()->bounding_box();
2109
36.6k
    xstarts[1] = box.right();
2110
36.6k
    return false;
2111
36.6k
  }
2112
19.6k
  last_state = 0;
2113
19.6k
  new_it.mark_cycle_pt();
2114
121k
  for (blobindex = 0; blobindex < textord_spline_medianwin; blobindex++) {
2115
107k
    new_box = box_next_pre_chopped(&new_it);
2116
107k
    middle = (new_box.left() + new_box.right()) / 2.0;
2117
107k
    yshift = new_box.bottom() - row->line_m() * middle - row->line_c();
2118
    // record shift
2119
107k
    yshifts.add(yshift, blobindex);
2120
107k
    if (new_it.cycled_list()) {
2121
5.21k
      xstarts[1] = new_box.right();
2122
5.21k
      return false;
2123
5.21k
    }
2124
107k
  }
2125
57.8k
  for (blobcount = 0; blobcount < textord_spline_medianwin / 2; blobcount++) {
2126
43.3k
    box = box_next_pre_chopped(&blob_it);
2127
43.3k
  }
2128
119k
  do {
2129
119k
    new_box = box_next_pre_chopped(&new_it);
2130
    // get middle one
2131
119k
    yshift = yshifts[textord_spline_medianwin / 2];
2132
119k
    if (yshift > textord_spline_shift_fraction * block->line_size) {
2133
53.3k
      state = 1;
2134
66.1k
    } else if (-yshift > textord_spline_shift_fraction * block->line_size) {
2135
30.8k
      state = -1;
2136
35.3k
    } else {
2137
35.3k
      state = 0;
2138
35.3k
    }
2139
119k
    if (state != 0) {
2140
84.2k
      needs_curve = true;
2141
84.2k
    }
2142
    //              tprintf("State=%d, prev=%d, shift=%g\n",
2143
    //                      state,last_state,yshift);
2144
119k
    if (state != last_state && blobcount > textord_spline_minblobs) {
2145
2.50k
      xstarts[segments++] = box.left();
2146
2.50k
      blobcount = 0;
2147
2.50k
    }
2148
119k
    last_state = state;
2149
119k
    yshifts.remove(blobindex - textord_spline_medianwin);
2150
119k
    box = box_next_pre_chopped(&blob_it);
2151
119k
    middle = (new_box.left() + new_box.right()) / 2.0;
2152
119k
    yshift = new_box.bottom() - row->line_m() * middle - row->line_c();
2153
119k
    yshifts.add(yshift, blobindex);
2154
119k
    blobindex++;
2155
119k
    blobcount++;
2156
119k
  } while (!new_it.cycled_list());
2157
14.4k
  if (blobcount > textord_spline_minblobs || segments == 1) {
2158
12.7k
    xstarts[segments] = new_box.right();
2159
12.7k
  } else {
2160
1.73k
    xstarts[--segments] = new_box.right();
2161
1.73k
  }
2162
14.4k
  if (textord_oldbl_debug) {
2163
0
    tprintf("Made %d segments on row at (%d,%d)\n", segments, box.right(), box.bottom());
2164
0
  }
2165
14.4k
  return needs_curve;
2166
19.6k
}
2167
2168
/**
2169
 * @name linear_spline_baseline
2170
 *
2171
 * Divide the baseline up into segments which require a different
2172
 * quadratic fitted to them.
2173
 * @return true if enough blobs were far enough away to need a quadratic.
2174
 */
2175
double *linear_spline_baseline( // split baseline
2176
    TO_ROW *row,                // row to fit
2177
    TO_BLOCK *block,            // block it came from
2178
    int32_t &segments,          // no fo segments
2179
    int32_t xstarts[]           // coords of segments
2180
0
) {
2181
0
  int blobcount;         // no of blobs
2182
0
  int blobindex;         // current blob
2183
0
  int index1, index2;    // blob numbers
2184
0
  int blobs_per_segment; // blobs in each
2185
0
  TBOX box;              // blob box
2186
0
  TBOX new_box;          // new_it box
2187
                         // blobs
2188
0
  BLOBNBOX_IT blob_it = row->blob_list();
2189
0
  BLOBNBOX_IT new_it = blob_it; // front end
2190
0
  float b, c;                   // fitted curve
2191
0
  tesseract::DetLineFit lms;
2192
0
  int32_t segment; // current segment
2193
2194
0
  box = box_next_pre_chopped(&blob_it);
2195
0
  xstarts[0] = box.left();
2196
0
  blobcount = 1;
2197
0
  while (!blob_it.at_first()) {
2198
0
    blobcount++;
2199
0
    box = box_next_pre_chopped(&blob_it);
2200
0
  }
2201
0
  segments = blobcount / textord_spline_medianwin;
2202
0
  if (segments < 1) {
2203
0
    segments = 1;
2204
0
  }
2205
0
  blobs_per_segment = blobcount / segments;
2206
  // quadratic coeffs
2207
0
  auto *coeffs = new double[segments * 3];
2208
0
  if (textord_oldbl_debug) {
2209
0
    tprintf(
2210
0
        "Linear splining baseline of %d blobs at (%d,%d), into %d segments of "
2211
0
        "%d blobs\n",
2212
0
        blobcount, box.left(), box.bottom(), segments, blobs_per_segment);
2213
0
  }
2214
0
  segment = 1;
2215
0
  for (index2 = 0; index2 < blobs_per_segment / 2; index2++) {
2216
0
    box_next_pre_chopped(&new_it);
2217
0
  }
2218
0
  index1 = 0;
2219
0
  blobindex = index2;
2220
0
  do {
2221
0
    blobindex += blobs_per_segment;
2222
0
    lms.Clear();
2223
0
    while (index1 < blobindex || (segment == segments && index1 < blobcount)) {
2224
0
      box = box_next_pre_chopped(&blob_it);
2225
0
      int middle = (box.left() + box.right()) / 2;
2226
0
      lms.Add(ICOORD(middle, box.bottom()));
2227
0
      index1++;
2228
0
      if (index1 == blobindex - blobs_per_segment / 2 || index1 == blobcount - 1) {
2229
0
        xstarts[segment] = box.left();
2230
0
      }
2231
0
    }
2232
0
    lms.Fit(&b, &c);
2233
0
    coeffs[segment * 3 - 3] = 0;
2234
0
    coeffs[segment * 3 - 2] = b;
2235
0
    coeffs[segment * 3 - 1] = c;
2236
0
    segment++;
2237
0
    if (segment > segments) {
2238
0
      break;
2239
0
    }
2240
2241
0
    blobindex += blobs_per_segment;
2242
0
    lms.Clear();
2243
0
    while (index2 < blobindex || (segment == segments && index2 < blobcount)) {
2244
0
      new_box = box_next_pre_chopped(&new_it);
2245
0
      int middle = (new_box.left() + new_box.right()) / 2;
2246
0
      lms.Add(ICOORD(middle, new_box.bottom()));
2247
0
      index2++;
2248
0
      if (index2 == blobindex - blobs_per_segment / 2 || index2 == blobcount - 1) {
2249
0
        xstarts[segment] = new_box.left();
2250
0
      }
2251
0
    }
2252
0
    lms.Fit(&b, &c);
2253
0
    coeffs[segment * 3 - 3] = 0;
2254
0
    coeffs[segment * 3 - 2] = b;
2255
0
    coeffs[segment * 3 - 1] = c;
2256
0
    segment++;
2257
0
  } while (segment <= segments);
2258
0
  return coeffs;
2259
0
}
2260
2261
/**
2262
 * @name assign_blobs_to_rows
2263
 *
2264
 * Make enough rows to allocate all the given blobs to one.
2265
 * If a block skew is given, use that, else attempt to track it.
2266
 */
2267
void assign_blobs_to_rows( // find lines
2268
    TO_BLOCK *block,       // block to do
2269
    float *gradient,       // block skew
2270
    int pass,              // identification
2271
    bool reject_misses,    // chuck big ones out
2272
    bool make_new_rows,    // add rows for unmatched
2273
    bool drawing_skew      // draw smoothed skew
2274
36.1k
) {
2275
36.1k
  OVERLAP_STATE overlap_result; // what to do with it
2276
36.1k
  float ycoord;                 // current y
2277
36.1k
  float top, bottom;            // of blob
2278
36.1k
  float g_length = 1.0f;        // from gradient
2279
36.1k
  int16_t row_count;            // no of rows
2280
36.1k
  int16_t left_x;               // left edge
2281
36.1k
  int16_t last_x;               // previous edge
2282
36.1k
  float block_skew;             // y delta
2283
36.1k
  float smooth_factor;          // for new coords
2284
36.1k
  float near_dist;              // dist to nearest row
2285
36.1k
  ICOORD testpt;                // testing only
2286
36.1k
  BLOBNBOX *blob;               // current blob
2287
36.1k
  TO_ROW *row;                  // current row
2288
36.1k
  TO_ROW *dest_row = nullptr;   // row to put blob in
2289
                                // iterators
2290
36.1k
  BLOBNBOX_IT blob_it = &block->blobs;
2291
36.1k
  TO_ROW_IT row_it = block->get_rows();
2292
2293
36.1k
  ycoord =
2294
36.1k
      (block->block->pdblk.bounding_box().bottom() + block->block->pdblk.bounding_box().top()) /
2295
36.1k
      2.0f;
2296
36.1k
  if (gradient != nullptr) {
2297
28.3k
    g_length = std::sqrt(1 + *gradient * *gradient);
2298
28.3k
  }
2299
#ifndef GRAPHICS_DISABLED
2300
  if (drawing_skew) {
2301
    to_win->SetCursor(block->block->pdblk.bounding_box().left(), ycoord);
2302
  }
2303
#endif
2304
36.1k
  testpt = ICOORD(textord_test_x, textord_test_y);
2305
36.1k
  blob_it.sort(blob_x_order);
2306
36.1k
  smooth_factor = 1.0;
2307
36.1k
  block_skew = 0.0f;
2308
36.1k
  row_count = row_it.length(); // might have rows
2309
36.1k
  if (!blob_it.empty()) {
2310
33.4k
    left_x = blob_it.data()->bounding_box().left();
2311
33.4k
  } else {
2312
2.67k
    left_x = block->block->pdblk.bounding_box().left();
2313
2.67k
  }
2314
36.1k
  last_x = left_x;
2315
1.09M
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
2316
1.05M
    blob = blob_it.data();
2317
1.05M
    if (gradient != nullptr) {
2318
819k
      block_skew = (1 - 1 / g_length) * blob->bounding_box().bottom() +
2319
819k
                   *gradient / g_length * blob->bounding_box().left();
2320
819k
    } else if (blob->bounding_box().left() - last_x > block->line_size / 2 &&
2321
237k
               last_x - left_x > block->line_size * 2 && textord_interpolating_skew) {
2322
      //                      tprintf("Interpolating skew from %g",block_skew);
2323
457
      block_skew *= static_cast<float>(blob->bounding_box().left() - left_x) / (last_x - left_x);
2324
      //                      tprintf("to %g\n",block_skew);
2325
457
    }
2326
1.05M
    last_x = blob->bounding_box().left();
2327
1.05M
    top = blob->bounding_box().top() - block_skew;
2328
1.05M
    bottom = blob->bounding_box().bottom() - block_skew;
2329
#ifndef GRAPHICS_DISABLED
2330
    if (drawing_skew) {
2331
      to_win->DrawTo(blob->bounding_box().left(), ycoord + block_skew);
2332
    }
2333
#endif
2334
1.05M
    if (!row_it.empty()) {
2335
5.12M
      for (row_it.move_to_first(); !row_it.at_last() && row_it.data()->min_y() > top;
2336
4.10M
           row_it.forward()) {
2337
4.10M
      }
2338
1.01M
      row = row_it.data();
2339
1.01M
      if (row->min_y() <= top && row->max_y() >= bottom) {
2340
        // any overlap
2341
859k
        dest_row = row;
2342
859k
        overlap_result = most_overlapping_row(&row_it, dest_row, top, bottom, block->line_size,
2343
859k
                                              blob->bounding_box().contains(testpt));
2344
859k
        if (overlap_result == NEW_ROW && !reject_misses) {
2345
15.3k
          overlap_result = ASSIGN;
2346
15.3k
        }
2347
859k
      } else {
2348
156k
        overlap_result = NEW_ROW;
2349
156k
        if (!make_new_rows) {
2350
92.9k
          near_dist = row_it.data_relative(-1)->min_y() - top;
2351
          // below bottom
2352
92.9k
          if (bottom < row->min_y()) {
2353
20.5k
            if (row->min_y() - bottom <= (block->line_spacing - block->line_size) *
2354
20.5k
                                             tesseract::CCStruct::kDescenderFraction) {
2355
              // done it
2356
1.77k
              overlap_result = ASSIGN;
2357
1.77k
              dest_row = row;
2358
1.77k
            }
2359
72.4k
          } else if (near_dist > 0 && near_dist < bottom - row->max_y()) {
2360
13.6k
            row_it.backward();
2361
13.6k
            dest_row = row_it.data();
2362
13.6k
            if (dest_row->min_y() - bottom <= (block->line_spacing - block->line_size) *
2363
13.6k
                                                  tesseract::CCStruct::kDescenderFraction) {
2364
              // done it
2365
1.82k
              overlap_result = ASSIGN;
2366
1.82k
            }
2367
58.7k
          } else {
2368
58.7k
            if (top - row->max_y() <=
2369
58.7k
                (block->line_spacing - block->line_size) *
2370
58.7k
                    (textord_overlap_x + tesseract::CCStruct::kAscenderFraction)) {
2371
              // done it
2372
17.4k
              overlap_result = ASSIGN;
2373
17.4k
              dest_row = row;
2374
17.4k
            }
2375
58.7k
          }
2376
92.9k
        }
2377
156k
      }
2378
1.01M
      if (overlap_result == ASSIGN) {
2379
832k
        dest_row->add_blob(blob_it.extract(), top, bottom, block->line_size);
2380
832k
      }
2381
1.01M
      if (overlap_result == NEW_ROW) {
2382
180k
        if (make_new_rows && top - bottom < block->max_blob_size) {
2383
87.6k
          dest_row = new TO_ROW(blob_it.extract(), top, bottom, block->line_size);
2384
87.6k
          row_count++;
2385
87.6k
          if (bottom > row_it.data()->min_y()) {
2386
59.2k
            row_it.add_before_then_move(dest_row);
2387
          // insert in right place
2388
59.2k
          } else {
2389
28.4k
            row_it.add_after_then_move(dest_row);
2390
28.4k
          }
2391
87.6k
          smooth_factor = 1.0 / (row_count * textord_skew_lag + textord_skewsmooth_offset);
2392
92.8k
        } else {
2393
92.8k
          overlap_result = REJECT;
2394
92.8k
        }
2395
180k
      }
2396
1.01M
    } else if (make_new_rows && top - bottom < block->max_blob_size) {
2397
9.96k
      overlap_result = NEW_ROW;
2398
9.96k
      dest_row = new TO_ROW(blob_it.extract(), top, bottom, block->line_size);
2399
9.96k
      row_count++;
2400
9.96k
      row_it.add_after_then_move(dest_row);
2401
9.96k
      smooth_factor = 1.0 / (row_count * textord_skew_lag + textord_skewsmooth_offset2);
2402
30.6k
    } else {
2403
30.6k
      overlap_result = REJECT;
2404
30.6k
    }
2405
1.05M
    if (blob->bounding_box().contains(testpt) && textord_debug_blob) {
2406
0
      if (overlap_result != REJECT) {
2407
0
        tprintf("Test blob assigned to row at (%g,%g) on pass %d\n", dest_row->min_y(),
2408
0
                dest_row->max_y(), pass);
2409
0
      } else {
2410
0
        tprintf("Test blob assigned to no row on pass %d\n", pass);
2411
0
      }
2412
0
    }
2413
1.05M
    if (overlap_result != REJECT) {
2414
930k
      while (!row_it.at_first() && row_it.data()->min_y() > row_it.data_relative(-1)->min_y()) {
2415
13
        row = row_it.extract();
2416
13
        row_it.backward();
2417
13
        row_it.add_before_then_move(row);
2418
13
      }
2419
930k
      while (!row_it.at_last() && row_it.data()->min_y() < row_it.data_relative(1)->min_y()) {
2420
27
        row = row_it.extract();
2421
27
        row_it.forward();
2422
        // Keep rows in order.
2423
27
        row_it.add_after_then_move(row);
2424
27
      }
2425
930k
      BLOBNBOX_IT added_blob_it(dest_row->blob_list());
2426
930k
      added_blob_it.move_to_last();
2427
930k
      TBOX prev_box = added_blob_it.data_relative(-1)->bounding_box();
2428
930k
      if (dest_row->blob_list()->singleton() || !prev_box.major_x_overlap(blob->bounding_box())) {
2429
719k
        block_skew = (1 - smooth_factor) * block_skew +
2430
719k
                     smooth_factor * (blob->bounding_box().bottom() - dest_row->initial_min_y());
2431
719k
      }
2432
930k
    }
2433
1.05M
  }
2434
280k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
2435
244k
    if (row_it.data()->blob_list()->empty()) {
2436
668
      delete row_it.extract(); // Discard empty rows.
2437
668
    }
2438
244k
  }
2439
36.1k
}
2440
2441
/**
2442
 * @name most_overlapping_row
2443
 *
2444
 * Return the row which most overlaps the blob.
2445
 */
2446
OVERLAP_STATE most_overlapping_row( // find best row
2447
    TO_ROW_IT *row_it,              // iterator
2448
    TO_ROW *&best_row,              // output row
2449
    float top,                      // top of blob
2450
    float bottom,                   // bottom of blob
2451
    float rowsize,                  // max row size
2452
    bool testing_blob               // test stuff
2453
859k
) {
2454
859k
  OVERLAP_STATE result;          // result of tests
2455
859k
  float overlap;                 // of blob & row
2456
859k
  float bestover;                // nearest row
2457
859k
  float merge_top, merge_bottom; // size of merged row
2458
859k
  ICOORD testpt;                 // testing only
2459
859k
  TO_ROW *row;                   // current row
2460
859k
  TO_ROW *test_row;              // for multiple overlaps
2461
859k
  BLOBNBOX_IT blob_it;           // for merging rows
2462
2463
859k
  result = ASSIGN;
2464
859k
  row = row_it->data();
2465
859k
  bestover = top - bottom;
2466
859k
  if (top > row->max_y()) {
2467
89.3k
    bestover -= top - row->max_y();
2468
89.3k
  }
2469
859k
  if (bottom < row->min_y()) {
2470
    // compute overlap
2471
495k
    bestover -= row->min_y() - bottom;
2472
495k
  }
2473
859k
  if (testing_blob && textord_debug_blob) {
2474
0
    tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f\n", bottom, top, row->min_y(),
2475
0
            row->max_y(), rowsize, bestover);
2476
0
  }
2477
859k
  test_row = row;
2478
1.56M
  do {
2479
1.56M
    if (!row_it->at_last()) {
2480
1.47M
      row_it->forward();
2481
1.47M
      test_row = row_it->data();
2482
1.47M
      if (test_row->min_y() <= top && test_row->max_y() >= bottom) {
2483
747k
        merge_top = std::max(test_row->max_y(),row->max_y());
2484
747k
        merge_bottom = std::min(test_row->min_y(),row->min_y());
2485
747k
        if (merge_top - merge_bottom <= rowsize) {
2486
3.56k
          if (testing_blob && textord_debug_blob) {
2487
0
            tprintf("Merging rows at (%g,%g), (%g,%g)\n", row->min_y(), row->max_y(),
2488
0
                    test_row->min_y(), test_row->max_y());
2489
0
          }
2490
3.56k
          test_row->set_limits(merge_bottom, merge_top);
2491
3.56k
          blob_it.set_to_list(test_row->blob_list());
2492
3.56k
          blob_it.add_list_after(row->blob_list());
2493
3.56k
          blob_it.sort(blob_x_order);
2494
3.56k
          row_it->backward();
2495
3.56k
          delete row_it->extract();
2496
3.56k
          row_it->forward();
2497
3.56k
          bestover = -1.0f; // force replacement
2498
3.56k
        }
2499
747k
        overlap = top - bottom;
2500
747k
        if (top > test_row->max_y()) {
2501
504k
          overlap -= top - test_row->max_y();
2502
504k
        }
2503
747k
        if (bottom < test_row->min_y()) {
2504
90.1k
          overlap -= test_row->min_y() - bottom;
2505
90.1k
        }
2506
747k
        if (bestover >= rowsize - 1 && overlap >= rowsize - 1) {
2507
9.37k
          result = REJECT;
2508
9.37k
        }
2509
747k
        if (overlap > bestover) {
2510
413k
          bestover = overlap; // find biggest overlap
2511
413k
          row = test_row;
2512
413k
        }
2513
747k
        if (testing_blob && textord_debug_blob) {
2514
0
          tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f->%f\n", bottom, top,
2515
0
                  test_row->min_y(), test_row->max_y(), rowsize, overlap, bestover);
2516
0
        }
2517
747k
      }
2518
1.47M
    }
2519
1.56M
  } while (!row_it->at_last() && test_row->min_y() <= top && test_row->max_y() >= bottom);
2520
1.92M
  while (row_it->data() != row) {
2521
1.06M
    row_it->backward(); // make it point to row
2522
1.06M
  }
2523
                        // doesn't overlap much
2524
859k
  if (top - bottom - bestover > rowsize * textord_overlap_x &&
2525
859k
      (!textord_fix_makerow_bug || bestover < rowsize * textord_overlap_x) && result == ASSIGN) {
2526
60.2k
    result = NEW_ROW; // doesn't overlap enough
2527
60.2k
  }
2528
859k
  best_row = row;
2529
859k
  return result;
2530
859k
}
2531
2532
/**
2533
 * @name blob_x_order
2534
 *
2535
 * Sort function to sort blobs in x from page left.
2536
 */
2537
int blob_x_order(      // sort function
2538
    const BLOBNBOX *blob1, // items to compare
2539
8.75M
    const BLOBNBOX *blob2) {
2540
8.75M
  if (blob1->bounding_box().left() < blob2->bounding_box().left()) {
2541
5.24M
    return -1;
2542
5.24M
  } else if (blob1->bounding_box().left() > blob2->bounding_box().left()) {
2543
2.45M
    return 1;
2544
2.45M
  } else {
2545
1.05M
    return 0;
2546
1.05M
  }
2547
8.75M
}
2548
2549
/**
2550
 * @name mark_repeated_chars
2551
 *
2552
 * Mark blobs marked with BTFT_LEADER in repeated sets using the
2553
 * repeated_set member of BLOBNBOX.
2554
 */
2555
56.2k
void mark_repeated_chars(TO_ROW *row) {
2556
56.2k
  BLOBNBOX_IT box_it(row->blob_list()); // Iterator.
2557
56.2k
  int num_repeated_sets = 0;
2558
56.2k
  if (!box_it.empty()) {
2559
545k
    do {
2560
545k
      BLOBNBOX *bblob = box_it.data();
2561
545k
      int repeat_length = 1;
2562
545k
      if (bblob->flow() == BTFT_LEADER && !bblob->joined_to_prev() && bblob->cblob() != nullptr) {
2563
0
        BLOBNBOX_IT test_it(box_it);
2564
0
        for (test_it.forward(); !test_it.at_first();) {
2565
0
          bblob = test_it.data();
2566
0
          if (bblob->flow() != BTFT_LEADER) {
2567
0
            break;
2568
0
          }
2569
0
          test_it.forward();
2570
0
          bblob = test_it.data();
2571
0
          if (bblob->joined_to_prev() || bblob->cblob() == nullptr) {
2572
0
            repeat_length = 0;
2573
0
            break;
2574
0
          }
2575
0
          ++repeat_length;
2576
0
        }
2577
0
      }
2578
545k
      if (repeat_length >= kMinLeaderCount) {
2579
0
        num_repeated_sets++;
2580
0
        for (; repeat_length > 0; box_it.forward(), --repeat_length) {
2581
0
          bblob = box_it.data();
2582
0
          bblob->set_repeated_set(num_repeated_sets);
2583
0
        }
2584
545k
      } else {
2585
545k
        bblob->set_repeated_set(0);
2586
545k
        box_it.forward();
2587
545k
      }
2588
545k
    } while (!box_it.at_first()); // until all done
2589
56.2k
  }
2590
56.2k
  row->set_num_repeated_sets(num_repeated_sets);
2591
56.2k
}
2592
2593
} // namespace tesseract