Coverage Report

Created: 2025-07-23 07:12

/src/tesseract/src/textord/wordseg.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 * File:        wordseg.cpp  (Formerly wspace.c)
3
 * Description: Code to segment the blobs into words.
4
 * Author:      Ray Smith
5
 *
6
 * (C) Copyright 1992, Hewlett-Packard Ltd.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
// Include automatically generated configuration file if running autoconf.
20
#ifdef HAVE_CONFIG_H
21
#  include "config_auto.h"
22
#endif
23
24
#include "wordseg.h"
25
26
#include <cmath>
27
28
#include "blobbox.h"
29
#include "cjkpitch.h"
30
#include "drawtord.h"
31
#include "fpchop.h"
32
#include "makerow.h"
33
#include "pitsync1.h"
34
#include "statistc.h"
35
#include "textord.h"
36
#include "topitch.h"
37
#include "tovars.h"
38
39
namespace tesseract {
40
41
// When set (and textord_chopper_test is not), make_real_words() sends every
// non-empty row through Textord::make_prop_words(), whatever its pitch decision.
BOOL_VAR(textord_force_make_prop_words, false, "Force proportional word segmentation on all rows");
42
// When set, make_real_words() builds every non-empty row with
// Textord::make_blob_words() instead of the pitch-based paths.
BOOL_VAR(textord_chopper_test, false, "Chopper is being tested.");
43
44
0
// Maximum number of gap clusters requested by row_words2(); it sizes that
// function's gaps[] array and (plus one) its cluster_stats[] array.
#define BLOCK_STATS_CLUSTERS 10
45
46
/**
47
 * @name make_single_word
48
 *
49
 * For each row, arrange the blobs into one word. There is no fixed
50
 * pitch detection.
51
 */
52
53
0
void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows) {
54
0
  TO_ROW_IT to_row_it(rows);
55
0
  ROW_IT row_it(real_rows);
56
0
  for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()) {
57
0
    TO_ROW *row = to_row_it.data();
58
    // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
59
    // to create the word.
60
0
    C_BLOB_LIST cblobs;
61
0
    C_BLOB_IT cblob_it(&cblobs);
62
0
    BLOBNBOX_IT box_it(row->blob_list());
63
0
    for (; !box_it.empty(); box_it.forward()) {
64
0
      BLOBNBOX *bblob = box_it.extract();
65
0
      if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
66
0
        auto cblob = bblob->remove_cblob();
67
0
        if (cblob != nullptr) {
68
0
          C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
69
0
          cout_it.move_to_last();
70
0
          cout_it.add_list_after(cblob->out_list());
71
0
          delete cblob;
72
0
        }
73
0
      } else {
74
0
        auto cblob = bblob->remove_cblob();
75
0
        if (cblob != nullptr) {
76
0
          cblob_it.add_after_then_move(cblob);
77
0
        }
78
0
      }
79
0
      delete bblob;
80
0
    }
81
    // Convert the TO_ROW to a ROW.
82
0
    ROW *real_row =
83
0
        new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
84
0
    WERD_IT word_it(real_row->word_list());
85
0
    WERD *word = new WERD(&cblobs, 0, nullptr);
86
0
    word->set_flag(W_BOL, true);
87
0
    word->set_flag(W_EOL, true);
88
0
    word->set_flag(W_DONT_CHOP, one_blob);
89
0
    word_it.add_after_then_move(word);
90
0
    real_row->recalc_bounding_box();
91
0
    row_it.add_after_then_move(real_row);
92
0
  }
93
0
}
94
95
/**
96
 * make_words
97
 *
98
 * Arrange the blobs into words.
99
 */
100
void make_words(tesseract::Textord *textord,
101
                ICOORD page_tr,               // top right
102
                float gradient,               // page skew
103
                BLOCK_LIST *blocks,           // block list
104
17.2k
                TO_BLOCK_LIST *port_blocks) { // output list
105
17.2k
  TO_BLOCK_IT block_it;                       // iterator
106
17.2k
  TO_BLOCK *block;                            // current block
107
108
17.2k
  if (textord->use_cjk_fp_model()) {
109
0
    compute_fixed_pitch_cjk(page_tr, port_blocks);
110
17.2k
  } else {
111
17.2k
    compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),
112
17.2k
                        !bool(textord_test_landscape));
113
17.2k
  }
114
17.2k
  textord->to_spacing(page_tr, port_blocks);
115
17.2k
  block_it.set_to_list(port_blocks);
116
34.5k
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
117
17.2k
    block = block_it.data();
118
17.2k
    make_real_words(textord, block, FCOORD(1.0f, 0.0f));
119
17.2k
  }
120
17.2k
}
121
122
/**
123
 * @name set_row_spaces
124
 *
125
 * Set the min_space and max_nonspace members of the row so that
126
 * the blobs can be arranged into words.
127
 */
128
129
void set_row_spaces( // find space sizes
130
    TO_BLOCK *block, // block to do
131
    FCOORD rotation, // for drawing
132
    bool testing_on  // correct orientation
133
0
) {
134
0
  TO_ROW *row; // current row
135
0
  TO_ROW_IT row_it = block->get_rows();
136
137
0
  if (row_it.empty()) {
138
0
    return; // empty block
139
0
  }
140
0
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
141
0
    row = row_it.data();
142
0
    if (row->fixed_pitch == 0) {
143
0
      row->min_space = static_cast<int32_t>(
144
0
          ceil(row->pr_space - (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));
145
0
      row->max_nonspace = static_cast<int32_t>(
146
0
          floor(row->pr_nonsp + (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));
147
0
      if (testing_on && textord_show_initial_words) {
148
0
        tprintf("Assigning defaults %d non, %d space to row at %g\n", row->max_nonspace,
149
0
                row->min_space, row->intercept());
150
0
      }
151
0
      row->space_threshold = (row->max_nonspace + row->min_space) / 2;
152
0
      row->space_size = row->pr_space;
153
0
      row->kern_size = row->pr_nonsp;
154
0
    }
155
#ifndef GRAPHICS_DISABLED
156
    if (textord_show_initial_words && testing_on) {
157
      plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);
158
    }
159
#endif
160
0
  }
161
0
}
162
163
/**
164
 * @name row_words
165
 *
166
 * Compute the max nonspace and min space for the row.
167
 */
168
169
int32_t row_words(    // compute space size
170
    TO_BLOCK *block,  // block it came from
171
    TO_ROW *row,      // row to operate on
172
    int32_t maxwidth, // max expected space size
173
    FCOORD rotation,  // for drawing
174
    bool testing_on   // for debug
175
0
) {
176
0
  bool testing_row;      // contains testpt
177
0
  bool prev_valid;       // if decent size
178
0
  int32_t prev_x;        // end of prev blob
179
0
  int32_t cluster_count; // no of clusters
180
0
  int32_t gap_index;     // which cluster
181
0
  int32_t smooth_factor; // for smoothing stats
182
0
  BLOBNBOX *blob;        // current blob
183
0
  float lower, upper;    // clustering parameters
184
0
  float gaps[3];         // gap clusers
185
0
  ICOORD testpt;
186
0
  TBOX blob_box; // bounding box
187
                 // iterator
188
0
  BLOBNBOX_IT blob_it = row->blob_list();
189
0
  STATS gap_stats(0, maxwidth - 1);
190
0
  STATS cluster_stats[4]; // clusters
191
192
0
  testpt = ICOORD(textord_test_x, textord_test_y);
193
0
  smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
194
  //      if (testing_on)
195
  //              tprintf("Row smooth factor=%d\n",smooth_factor);
196
0
  prev_valid = false;
197
0
  prev_x = -INT32_MAX;
198
0
  testing_row = false;
199
0
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
200
0
    blob = blob_it.data();
201
0
    blob_box = blob->bounding_box();
202
0
    if (blob_box.contains(testpt)) {
203
0
      testing_row = true;
204
0
    }
205
0
    gap_stats.add(blob_box.width(), 1);
206
0
  }
207
0
  gap_stats.clear();
208
0
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
209
0
    blob = blob_it.data();
210
0
    if (!blob->joined_to_prev()) {
211
0
      blob_box = blob->bounding_box();
212
0
      if (prev_valid && blob_box.left() - prev_x < maxwidth) {
213
0
        gap_stats.add(blob_box.left() - prev_x, 1);
214
0
      }
215
0
      prev_valid = true;
216
0
      prev_x = blob_box.right();
217
0
    }
218
0
  }
219
0
  if (gap_stats.get_total() == 0) {
220
0
    row->min_space = 0; // no evidence
221
0
    row->max_nonspace = 0;
222
0
    return 0;
223
0
  }
224
0
  gap_stats.smooth(smooth_factor);
225
0
  lower = row->xheight * textord_words_initial_lower;
226
0
  upper = row->xheight * textord_words_initial_upper;
227
0
  cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
228
0
  while (cluster_count < 2 && std::ceil(lower) < std::floor(upper)) {
229
    // shrink gap
230
0
    upper = (upper * 3 + lower) / 4;
231
0
    lower = (lower * 3 + upper) / 4;
232
0
    cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
233
0
  }
234
0
  if (cluster_count < 2) {
235
0
    row->min_space = 0; // no evidence
236
0
    row->max_nonspace = 0;
237
0
    return 0;
238
0
  }
239
0
  for (gap_index = 0; gap_index < cluster_count; gap_index++) {
240
0
    gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
241
0
  }
242
  // get medians
243
0
  if (cluster_count > 2) {
244
0
    if (testing_on && textord_show_initial_words) {
245
0
      tprintf("Row at %g has 3 sizes of gap:%g,%g,%g\n", row->intercept(),
246
0
              cluster_stats[1].ile(0.5), cluster_stats[2].ile(0.5), cluster_stats[3].ile(0.5));
247
0
    }
248
0
    lower = gaps[0];
249
0
    if (gaps[1] > lower) {
250
0
      upper = gaps[1]; // prefer most frequent
251
0
      if (upper < block->xheight * textord_words_min_minspace && gaps[2] > gaps[1]) {
252
0
        upper = gaps[2];
253
0
      }
254
0
    } else if (gaps[2] > lower && gaps[2] >= block->xheight * textord_words_min_minspace) {
255
0
      upper = gaps[2];
256
0
    } else if (lower >= block->xheight * textord_words_min_minspace) {
257
0
      upper = lower; // not nice
258
0
      lower = gaps[1];
259
0
      if (testing_on && textord_show_initial_words) {
260
0
        tprintf("Had to switch most common from lower to upper!!\n");
261
0
        gap_stats.print();
262
0
      }
263
0
    } else {
264
0
      row->min_space = 0; // no evidence
265
0
      row->max_nonspace = 0;
266
0
      return 0;
267
0
    }
268
0
  } else {
269
0
    if (gaps[1] < gaps[0]) {
270
0
      if (testing_on && textord_show_initial_words) {
271
0
        tprintf("Had to switch most common from lower to upper!!\n");
272
0
        gap_stats.print();
273
0
      }
274
0
      lower = gaps[1];
275
0
      upper = gaps[0];
276
0
    } else {
277
0
      upper = gaps[1];
278
0
      lower = gaps[0];
279
0
    }
280
0
  }
281
0
  if (upper < block->xheight * textord_words_min_minspace) {
282
0
    row->min_space = 0; // no evidence
283
0
    row->max_nonspace = 0;
284
0
    return 0;
285
0
  }
286
0
  if (upper * 3 < block->min_space * 2 + block->max_nonspace ||
287
0
      lower * 3 > block->min_space * 2 + block->max_nonspace) {
288
0
    if (testing_on && textord_show_initial_words) {
289
0
      tprintf("Disagreement between block and row at %g!!\n", row->intercept());
290
0
      tprintf("Lower=%g, upper=%g, Stats:\n", lower, upper);
291
0
      gap_stats.print();
292
0
    }
293
0
  }
294
0
  row->min_space =
295
0
      static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
296
0
  row->max_nonspace =
297
0
      static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
298
0
  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
299
0
  row->space_size = upper;
300
0
  row->kern_size = lower;
301
0
  if (testing_on && textord_show_initial_words) {
302
0
    if (testing_row) {
303
0
      tprintf("GAP STATS\n");
304
0
      gap_stats.print();
305
0
      tprintf("SPACE stats\n");
306
0
      cluster_stats[2].print_summary();
307
0
      tprintf("NONSPACE stats\n");
308
0
      cluster_stats[1].print_summary();
309
0
    }
310
0
    tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
311
0
            upper, row->max_nonspace, lower);
312
0
  }
313
0
  return cluster_stats[2].get_total();
314
0
}
315
316
/**
317
 * @name row_words2
318
 *
319
 * Compute the max nonspace and min space for the row.
320
 */
321
322
int32_t row_words2(   // compute space size
323
    TO_BLOCK *block,  // block it came from
324
    TO_ROW *row,      // row to operate on
325
    int32_t maxwidth, // max expected space size
326
    FCOORD rotation,  // for drawing
327
    bool testing_on   // for debug
328
0
) {
329
0
  bool prev_valid;       // if decent size
330
0
  bool this_valid;       // current blob big enough
331
0
  int32_t prev_x;        // end of prev blob
332
0
  int32_t min_width;     // min interesting width
333
0
  int32_t valid_count;   // good gaps
334
0
  int32_t total_count;   // total gaps
335
0
  int32_t cluster_count; // no of clusters
336
0
  int32_t prev_count;    // previous cluster_count
337
0
  int32_t gap_index;     // which cluster
338
0
  int32_t smooth_factor; // for smoothing stats
339
0
  BLOBNBOX *blob;        // current blob
340
0
  float lower, upper;    // clustering parameters
341
0
  ICOORD testpt;
342
0
  TBOX blob_box; // bounding box
343
                 // iterator
344
0
  BLOBNBOX_IT blob_it = row->blob_list();
345
0
  STATS gap_stats(0, maxwidth - 1);
346
  // gap sizes
347
0
  float gaps[BLOCK_STATS_CLUSTERS];
348
0
  STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
349
  // clusters
350
351
0
  testpt = ICOORD(textord_test_x, textord_test_y);
352
0
  smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
353
  //      if (testing_on)
354
  //              tprintf("Row smooth factor=%d\n",smooth_factor);
355
0
  prev_valid = false;
356
0
  prev_x = -INT16_MAX;
357
0
  const bool testing_row = false;
358
  // min blob size
359
0
  min_width = static_cast<int32_t>(block->pr_space);
360
0
  total_count = 0;
361
0
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
362
0
    blob = blob_it.data();
363
0
    if (!blob->joined_to_prev()) {
364
0
      blob_box = blob->bounding_box();
365
0
      this_valid = blob_box.width() >= min_width;
366
0
      if (this_valid && prev_valid && blob_box.left() - prev_x < maxwidth) {
367
0
        gap_stats.add(blob_box.left() - prev_x, 1);
368
0
      }
369
0
      total_count++; // count possibles
370
0
      prev_x = blob_box.right();
371
0
      prev_valid = this_valid;
372
0
    }
373
0
  }
374
0
  valid_count = gap_stats.get_total();
375
0
  if (valid_count < total_count * textord_words_minlarge) {
376
0
    gap_stats.clear();
377
0
    prev_x = -INT16_MAX;
378
0
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
379
0
      blob = blob_it.data();
380
0
      if (!blob->joined_to_prev()) {
381
0
        blob_box = blob->bounding_box();
382
0
        if (blob_box.left() - prev_x < maxwidth) {
383
0
          gap_stats.add(blob_box.left() - prev_x, 1);
384
0
        }
385
0
        prev_x = blob_box.right();
386
0
      }
387
0
    }
388
0
  }
389
0
  if (gap_stats.get_total() == 0) {
390
0
    row->min_space = 0; // no evidence
391
0
    row->max_nonspace = 0;
392
0
    return 0;
393
0
  }
394
395
0
  cluster_count = 0;
396
0
  lower = block->xheight * words_initial_lower;
397
0
  upper = block->xheight * words_initial_upper;
398
0
  gap_stats.smooth(smooth_factor);
399
0
  do {
400
0
    prev_count = cluster_count;
401
0
    cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop,
402
0
                                      BLOCK_STATS_CLUSTERS, cluster_stats);
403
0
  } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
404
0
  if (cluster_count < 1) {
405
0
    row->min_space = 0;
406
0
    row->max_nonspace = 0;
407
0
    return 0;
408
0
  }
409
0
  for (gap_index = 0; gap_index < cluster_count; gap_index++) {
410
0
    gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
411
0
  }
412
  // get medians
413
0
  if (testing_on) {
414
0
    tprintf("cluster_count=%d:", cluster_count);
415
0
    for (gap_index = 0; gap_index < cluster_count; gap_index++) {
416
0
      tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total());
417
0
    }
418
0
    tprintf("\n");
419
0
  }
420
421
  // Try to find proportional non-space and space for row.
422
0
  for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace;
423
0
       gap_index++) {
424
0
    ;
425
0
  }
426
0
  if (gap_index < cluster_count) {
427
0
    lower = gaps[gap_index]; // most frequent below
428
0
  } else {
429
0
    if (testing_on) {
430
0
      tprintf("No cluster below block threshold!, using default=%g\n", block->pr_nonsp);
431
0
    }
432
0
    lower = block->pr_nonsp;
433
0
  }
434
0
  for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace;
435
0
       gap_index++) {
436
0
    ;
437
0
  }
438
0
  if (gap_index < cluster_count) {
439
0
    upper = gaps[gap_index]; // most frequent above
440
0
  } else {
441
0
    if (testing_on) {
442
0
      tprintf("No cluster above block threshold!, using default=%g\n", block->pr_space);
443
0
    }
444
0
    upper = block->pr_space;
445
0
  }
446
0
  row->min_space =
447
0
      static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
448
0
  row->max_nonspace =
449
0
      static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
450
0
  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
451
0
  row->space_size = upper;
452
0
  row->kern_size = lower;
453
0
  if (testing_on) {
454
0
    if (testing_row) {
455
0
      tprintf("GAP STATS\n");
456
0
      gap_stats.print();
457
0
      tprintf("SPACE stats\n");
458
0
      cluster_stats[2].print_summary();
459
0
      tprintf("NONSPACE stats\n");
460
0
      cluster_stats[1].print_summary();
461
0
    }
462
0
    tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
463
0
            upper, row->max_nonspace, lower);
464
0
  }
465
0
  return 1;
466
0
}
467
468
/**
469
 * @name make_real_words
470
 *
471
 * Convert a TO_BLOCK to a BLOCK.
472
 */
473
474
void make_real_words(tesseract::Textord *textord,
475
                     TO_BLOCK *block, // block to do
476
                     FCOORD rotation  // for drawing
477
17.2k
) {
478
17.2k
  TO_ROW *row; // current row
479
17.2k
  TO_ROW_IT row_it = block->get_rows();
480
17.2k
  ROW *real_row = nullptr; // output row
481
17.2k
  ROW_IT real_row_it = block->block->row_list();
482
483
17.2k
  if (row_it.empty()) {
484
519
    return; // empty block
485
519
  }
486
202k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
487
185k
    row = row_it.data();
488
185k
    if (row->blob_list()->empty() && !row->rep_words.empty()) {
489
0
      real_row = make_rep_words(row, block);
490
185k
    } else if (!row->blob_list()->empty()) {
491
      // In a fixed pitch document, some lines may be detected as fixed pitch
492
      // while others don't, and will go through different path.
493
      // For non-space delimited language like CJK, fixed pitch chop always
494
      // leave the entire line as one word.  We can force consistent chopping
495
      // with force_make_prop_words flag.
496
185k
      POLY_BLOCK *pb = block->block->pdblk.poly_block();
497
185k
      if (textord_chopper_test) {
498
0
        real_row = textord->make_blob_words(row, rotation);
499
185k
      } else if (textord_force_make_prop_words || (pb != nullptr && !pb->IsText()) ||
500
185k
                 row->pitch_decision == PITCH_DEF_PROP || row->pitch_decision == PITCH_CORR_PROP) {
501
180k
        real_row = textord->make_prop_words(row, rotation);
502
180k
      } else if (row->pitch_decision == PITCH_DEF_FIXED ||
503
5.56k
                 row->pitch_decision == PITCH_CORR_FIXED) {
504
5.56k
        real_row = fixed_pitch_words(row, rotation);
505
5.56k
      } else {
506
0
        ASSERT_HOST(false);
507
0
      }
508
185k
    }
509
185k
    if (real_row != nullptr) {
510
      // put row in block
511
185k
      real_row_it.add_after_then_move(real_row);
512
185k
    }
513
185k
  }
514
16.7k
  block->block->set_stats(block->fixed_pitch == 0, static_cast<int16_t>(block->kern_size),
515
16.7k
                          static_cast<int16_t>(block->space_size),
516
16.7k
                          static_cast<int16_t>(block->fixed_pitch));
517
16.7k
  block->block->check_pitch();
518
16.7k
}
519
520
/**
521
 * @name make_rep_words
522
 *
523
 * Fabricate a real row from only the repeated blob words.
524
 * Get the xheight from the block as it may be more meaningful.
525
 */
526
527
ROW *make_rep_words( // make a row
528
    TO_ROW *row,     // row to convert
529
    TO_BLOCK *block  // block it lives in
530
0
) {
531
0
  ROW *real_row; // output row
532
0
  TBOX word_box; // bounding box
533
                 // iterator
534
0
  WERD_IT word_it = &row->rep_words;
535
536
0
  if (word_it.empty()) {
537
0
    return nullptr;
538
0
  }
539
0
  word_box = word_it.data()->bounding_box();
540
0
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
541
0
    word_box += word_it.data()->bounding_box();
542
0
  }
543
0
  row->xheight = block->xheight;
544
0
  real_row =
545
0
      new ROW(row, static_cast<int16_t>(block->kern_size), static_cast<int16_t>(block->space_size));
546
0
  word_it.set_to_list(real_row->word_list());
547
  // put words in row
548
0
  word_it.add_list_after(&row->rep_words);
549
0
  real_row->recalc_bounding_box();
550
0
  return real_row;
551
0
}
552
553
/**
554
 * @name make_real_word
555
 *
556
 * Construct a WERD from a given number of adjacent entries in a
557
 * list of BLOBNBOXs.
558
 */
559
560
WERD *make_real_word(BLOBNBOX_IT *box_it, // iterator
561
                     int32_t blobcount,   // no of blobs to use
562
                     bool bol,            // start of line
563
                     uint8_t blanks       // no of blanks
564
0
) {
565
0
  C_OUTLINE_IT cout_it;
566
0
  C_BLOB_LIST cblobs;
567
0
  C_BLOB_IT cblob_it = &cblobs;
568
569
0
  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
570
0
    auto bblob = box_it->extract();
571
0
    if (bblob->joined_to_prev()) {
572
0
      auto cblob = bblob->remove_cblob();
573
0
      if (cblob != nullptr) {
574
0
        cout_it.set_to_list(cblob_it.data()->out_list());
575
0
        cout_it.move_to_last();
576
0
        cout_it.add_list_after(cblob->out_list());
577
0
        delete cblob;
578
0
      }
579
0
    } else {
580
0
      auto cblob = bblob->remove_cblob();
581
0
      if (cblob != nullptr) {
582
0
        cblob_it.add_after_then_move(cblob);
583
0
      }
584
0
    }
585
0
    delete bblob;
586
0
    box_it->forward(); // next one
587
0
  }
588
589
0
  if (blanks < 1) {
590
0
    blanks = 1;
591
0
  }
592
593
0
  auto word = new WERD(&cblobs, blanks, nullptr);
594
595
0
  if (bol) {
596
0
    word->set_flag(W_BOL, true);
597
0
  }
598
0
  if (box_it->at_first()) {
599
0
    word->set_flag(W_EOL, true); // at end of line
600
0
  }
601
602
0
  return word;
603
0
}
604
605
} // namespace tesseract