/src/tesseract/src/textord/wordseg.cpp

Source (jump to first uncovered line)
/**********************************************************************
 * File:        wordseg.cpp  (Formerly wspace.c)
 * Description: Code to segment the blobs into words.
 * Author:      Ray Smith
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#  include "config_auto.h"
#endif

#include "wordseg.h"

#include <cmath>

#include "blobbox.h"
#include "cjkpitch.h"
#include "drawtord.h"
#include "fpchop.h"
#include "makerow.h"
#include "pitsync1.h"
#include "statistc.h"
#include "textord.h"
#include "topitch.h"
#include "tovars.h"

namespace tesseract {

// Read by make_real_words: when set, every non-empty row is segmented with
// Textord::make_prop_words regardless of the row's pitch decision.
BOOL_VAR(textord_force_make_prop_words, false, "Force proportional word segmentation on all rows");
// Read by make_real_words: when set, non-empty rows are segmented with
// Textord::make_blob_words; this check takes precedence over the flag above.
BOOL_VAR(textord_chopper_test, false, "Chopper is being tested.");

// Maximum number of gap clusters row_words2 requests from STATS::cluster; also
// sizes its gaps[] array and (plus one) its cluster_stats[] array.
#define BLOCK_STATS_CLUSTERS 10

/**
 * @name make_single_word
 *
 * For each row, arrange the blobs into one word. There is no fixed
 * pitch detection.
 *
 * Every BLOBNBOX is extracted from the row's blob list and deleted; its
 * C_BLOB (if it has one) is either appended to the word as a new blob or has
 * its outlines merged into the most recently added blob.
 *
 * @param one_blob  if true, all outlines of the row end up in a single C_BLOB
 *                  and the resulting word is flagged W_DONT_CHOP.
 * @param rows      input rows; their blob lists are emptied by this call.
 * @param real_rows output list; one new ROW holding one WERD is appended for
 *                  each input row.
 */

void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows) {
  TO_ROW_IT to_row_it(rows);
  ROW_IT row_it(real_rows);
  for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()) {
    TO_ROW *row = to_row_it.data();
    // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
    // to create the word.
    C_BLOB_LIST cblobs;
    C_BLOB_IT cblob_it(&cblobs);
    BLOBNBOX_IT box_it(row->blob_list());
    for (; !box_it.empty(); box_it.forward()) {
      BLOBNBOX *bblob = box_it.extract();
      // Outlines can only be merged into a blob that already exists. A blob
      // flagged joined_to_prev can still arrive while cblobs is empty (it is
      // the first blob, or every earlier BLOBNBOX had no C_BLOB); in that case
      // it starts a new blob instead of dereferencing an empty iterator.
      const bool merge_into_prev = (bblob->joined_to_prev() || one_blob) && !cblob_it.empty();
      auto cblob = bblob->remove_cblob();
      if (cblob != nullptr) {
        if (merge_into_prev) {
          // Move the outlines to the end of the previous blob's outline list,
          // then discard the now redundant blob object.
          C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
          cout_it.move_to_last();
          cout_it.add_list_after(cblob->out_list());
          delete cblob;
        } else {
          cblob_it.add_after_then_move(cblob);
        }
      }
      delete bblob;
    }
    // Convert the TO_ROW to a ROW.
    ROW *real_row =
        new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
    WERD_IT word_it(real_row->word_list());
    // The single word is both the beginning and the end of its line.
    WERD *word = new WERD(&cblobs, 0, nullptr);
    word->set_flag(W_BOL, true);
    word->set_flag(W_EOL, true);
    word->set_flag(W_DONT_CHOP, one_blob);
    word_it.add_after_then_move(word);
    real_row->recalc_bounding_box();
    row_it.add_after_then_move(real_row);
  }
}

/**
 * make_words
 *
 * Arrange the blobs into words: run pitch analysis over all blocks (the CJK
 * variant when the Textord object asks for it), let Textord compute spacing,
 * then convert every TO_BLOCK with make_real_words.
 */
void make_words(tesseract::Textord *textord,
                ICOORD page_tr,               // top right
                float gradient,               // page skew
                BLOCK_LIST *blocks,           // block list
                TO_BLOCK_LIST *port_blocks) { // output list
  // Pitch analysis.
  if (!textord->use_cjk_fp_model()) {
    const bool not_landscape = !bool(textord_test_landscape);
    compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f), not_landscape);
  } else {
    compute_fixed_pitch_cjk(page_tr, port_blocks);
  }

  // Spacing analysis.
  textord->to_spacing(page_tr, port_blocks);

  // Word construction, one block at a time.
  TO_BLOCK_IT block_it(port_blocks);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    make_real_words(textord, block_it.data(), FCOORD(1.0f, 0.0f));
  }
}

/**
 * @name set_row_spaces
 *
 * For every row of the block whose fixed_pitch is 0, derive min_space,
 * max_nonspace, space_threshold, space_size and kern_size from the row's
 * pr_space / pr_nonsp estimates so that the blobs can be arranged into words.
 * Rows with a non-zero fixed_pitch are left untouched (apart from optional
 * debug plotting).
 */

void set_row_spaces( // find space sizes
    TO_BLOCK *block, // block to do
    FCOORD rotation, // for drawing
    bool testing_on  // correct orientation
) {
  TO_ROW_IT row_it = block->get_rows();
  if (row_it.empty()) {
    return; // nothing to do for a block without rows
  }

  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    TO_ROW *const cur_row = row_it.data();
    if (cur_row->fixed_pitch == 0) {
      // Portion of the nonspace..space interval kept clear on either side.
      const auto margin = (cur_row->pr_space - cur_row->pr_nonsp) * textord_words_definite_spread;
      cur_row->min_space = static_cast<int32_t>(ceil(cur_row->pr_space - margin));
      cur_row->max_nonspace = static_cast<int32_t>(floor(cur_row->pr_nonsp + margin));
      if (testing_on && textord_show_initial_words) {
        tprintf("Assigning defaults %d non, %d space to row at %g\n", cur_row->max_nonspace,
                cur_row->min_space, cur_row->intercept());
      }
      // The threshold sits midway between the two limits (integer division).
      cur_row->space_threshold = (cur_row->max_nonspace + cur_row->min_space) / 2;
      cur_row->space_size = cur_row->pr_space;
      cur_row->kern_size = cur_row->pr_nonsp;
    }
#ifndef GRAPHICS_DISABLED
    if (testing_on && textord_show_initial_words) {
      plot_word_decisions(to_win, static_cast<int16_t>(cur_row->fixed_pitch), cur_row);
    }
#endif
  }
}

/**
 * @name row_words
 *
 * Compute the max nonspace and min space for the row.
 *
 * Builds a histogram of horizontal gaps between consecutive blobs that are
 * not flagged joined_to_prev, clusters it into at most 3 clusters, and picks
 * a "lower" (kerning) and "upper" (space) gap size from the cluster medians.
 * On success it writes row->min_space, row->max_nonspace,
 * row->space_threshold, row->space_size and row->kern_size.
 *
 * Returns 0 (after zeroing row->min_space and row->max_nonspace) when there
 * is no usable evidence; otherwise returns the total count of
 * cluster_stats[2]. The rotation parameter is not referenced in the body.
 */

int32_t row_words(    // compute space size
    TO_BLOCK *block,  // block it came from
    TO_ROW *row,      // row to operate on
    int32_t maxwidth, // max expected space size
    FCOORD rotation,  // for drawing
    bool testing_on   // for debug
) {
  bool testing_row;      // contains testpt
  bool prev_valid;       // a previous blob has been seen
  int32_t prev_x;        // end of prev blob
  int32_t cluster_count; // no of clusters
  int32_t gap_index;     // which cluster
  int32_t smooth_factor; // for smoothing stats
  BLOBNBOX *blob;        // current blob
  float lower, upper;    // clustering parameters
  float gaps[3];         // gap clusters (medians)
  ICOORD testpt;
  TBOX blob_box; // bounding box
                 // iterator
  BLOBNBOX_IT blob_it = row->blob_list();
  STATS gap_stats(0, maxwidth - 1);
  STATS cluster_stats[4]; // clusters; indices 1..3 are read below

  testpt = ICOORD(textord_test_x, textord_test_y);
  smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
  //      if (testing_on)
  //              tprintf("Row smooth factor=%d\n",smooth_factor);
  prev_valid = false;
  prev_x = -INT32_MAX;
  testing_row = false;
  // First pass: detect whether any blob of this row contains the test point.
  // NOTE(review): the widths added to gap_stats here appear to be discarded by
  // the clear() right after the loop - confirm against STATS::clear.
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    blob = blob_it.data();
    blob_box = blob->bounding_box();
    if (blob_box.contains(testpt)) {
      testing_row = true;
    }
    gap_stats.add(blob_box.width(), 1);
  }
  gap_stats.clear();
  // Second pass: histogram of gaps between consecutive non-joined blobs.
  // prev_valid guards the first blob, so the prev_x sentinel is never used in
  // the subtraction. Gaps of maxwidth or more are ignored.
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    blob = blob_it.data();
    if (!blob->joined_to_prev()) {
      blob_box = blob->bounding_box();
      if (prev_valid && blob_box.left() - prev_x < maxwidth) {
        gap_stats.add(blob_box.left() - prev_x, 1);
      }
      prev_valid = true;
      prev_x = blob_box.right();
    }
  }
  if (gap_stats.get_total() == 0) {
    row->min_space = 0; // no evidence
    row->max_nonspace = 0;
    return 0;
  }
  gap_stats.smooth(smooth_factor);
  lower = row->xheight * textord_words_initial_lower;
  upper = row->xheight * textord_words_initial_upper;
  cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
  // Retry with progressively changed lower/upper until at least two clusters
  // are found or ceil(lower) is no longer below floor(upper).
  while (cluster_count < 2 && std::ceil(lower) < std::floor(upper)) {
    // shrink gap
    // Note: the new lower is computed from the already updated upper.
    upper = (upper * 3 + lower) / 4;
    lower = (lower * 3 + upper) / 4;
    cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
  }
  if (cluster_count < 2) {
    row->min_space = 0; // no evidence
    row->max_nonspace = 0;
    return 0;
  }
  // gaps[i] is the median of cluster i + 1 (cluster_stats[0] is not read here).
  for (gap_index = 0; gap_index < cluster_count; gap_index++) {
    gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
  }
  // get medians
  if (cluster_count > 2) {
    // Three clusters: choose which medians act as kerning (lower) and space
    // (upper), rejecting a candidate space smaller than
    // block->xheight * textord_words_min_minspace where the code checks it.
    if (testing_on && textord_show_initial_words) {
      tprintf("Row at %g has 3 sizes of gap:%g,%g,%g\n", row->intercept(),
              cluster_stats[1].ile(0.5), cluster_stats[2].ile(0.5), cluster_stats[3].ile(0.5));
    }
    lower = gaps[0];
    if (gaps[1] > lower) {
      upper = gaps[1]; // prefer most frequent
      if (upper < block->xheight * textord_words_min_minspace && gaps[2] > gaps[1]) {
        upper = gaps[2];
      }
    } else if (gaps[2] > lower && gaps[2] >= block->xheight * textord_words_min_minspace) {
      upper = gaps[2];
    } else if (lower >= block->xheight * textord_words_min_minspace) {
      // gaps[0] is itself big enough to be a space: use it as upper and take
      // gaps[1] as the kerning size.
      upper = lower; // not nice
      lower = gaps[1];
      if (testing_on && textord_show_initial_words) {
        tprintf("Had to switch most common from lower to upper!!\n");
        gap_stats.print();
      }
    } else {
      row->min_space = 0; // no evidence
      row->max_nonspace = 0;
      return 0;
    }
  } else {
    // Exactly two clusters: the smaller median is lower, the larger is upper.
    if (gaps[1] < gaps[0]) {
      if (testing_on && textord_show_initial_words) {
        tprintf("Had to switch most common from lower to upper!!\n");
        gap_stats.print();
      }
      lower = gaps[1];
      upper = gaps[0];
    } else {
      upper = gaps[1];
      lower = gaps[0];
    }
  }
  if (upper < block->xheight * textord_words_min_minspace) {
    row->min_space = 0; // no evidence
    row->max_nonspace = 0;
    return 0;
  }
  // Diagnostic only: report when the row's estimate disagrees with the block's
  // limits. The row values are used regardless of the outcome.
  if (upper * 3 < block->min_space * 2 + block->max_nonspace ||
      lower * 3 > block->min_space * 2 + block->max_nonspace) {
    if (testing_on && textord_show_initial_words) {
      tprintf("Disagreement between block and row at %g!!\n", row->intercept());
      tprintf("Lower=%g, upper=%g, Stats:\n", lower, upper);
      gap_stats.print();
    }
  }
  // Pull both limits in from lower/upper by a fraction of their separation
  // (textord_words_definite_spread); the threshold is their integer midpoint.
  row->min_space =
      static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
  row->max_nonspace =
      static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
  row->space_size = upper;
  row->kern_size = lower;
  if (testing_on && textord_show_initial_words) {
    if (testing_row) {
      tprintf("GAP STATS\n");
      gap_stats.print();
      tprintf("SPACE stats\n");
      cluster_stats[2].print_summary();
      tprintf("NONSPACE stats\n");
      cluster_stats[1].print_summary();
    }
    tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
            upper, row->max_nonspace, lower);
  }
  // Always the total of cluster 2, whichever cluster ended up chosen as upper.
  return cluster_stats[2].get_total();
}

/**
 * @name row_words2
 *
 * Compute the max nonspace and min space for the row.
 *
 * Builds a histogram of gaps between consecutive blobs that are not flagged
 * joined_to_prev. The first pass only counts gaps whose two neighbouring blobs
 * are both at least block->pr_space wide; if too few gaps qualify (fewer than
 * total_count * textord_words_minlarge) the histogram is rebuilt from all
 * gaps. The histogram is clustered and the row's kerning ("lower") and space
 * ("upper") sizes are taken from the first cluster median at or below,
 * respectively above, block->max_nonspace, falling back to block->pr_nonsp /
 * block->pr_space when no such cluster exists.
 *
 * On success writes row->min_space, row->max_nonspace, row->space_threshold,
 * row->space_size and row->kern_size.
 *
 * @return 0 (after zeroing row->min_space and row->max_nonspace) if no gaps
 *         or no clusters were found, otherwise 1. The rotation parameter is
 *         not referenced in the body.
 */

int32_t row_words2(   // compute space size
    TO_BLOCK *block,  // block it came from
    TO_ROW *row,      // row to operate on
    int32_t maxwidth, // max expected space size
    FCOORD rotation,  // for drawing
    bool testing_on   // for debug
) {
  BLOBNBOX_IT blob_it = row->blob_list();
  STATS gap_stats(0, maxwidth - 1);
  float gaps[BLOCK_STATS_CLUSTERS];              // median of each cluster
  STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1]; // clusters; [1..] are read

  const auto smooth_factor =
      static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
  // Blobs narrower than this do not contribute gaps in the first pass.
  const auto min_width = static_cast<int32_t>(block->pr_space);

  // First pass: gaps between pairs of sufficiently wide blobs only.
  bool prev_valid = false; // previous blob exists and was wide enough
  int32_t prev_x = 0;      // right edge of previous blob; read only if guarded
  int32_t total_count = 0; // number of non-joined blobs seen
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    BLOBNBOX *blob = blob_it.data();
    if (blob->joined_to_prev()) {
      continue;
    }
    const TBOX &blob_box = blob->bounding_box();
    const bool this_valid = blob_box.width() >= min_width;
    if (this_valid && prev_valid && blob_box.left() - prev_x < maxwidth) {
      gap_stats.add(blob_box.left() - prev_x, 1);
    }
    total_count++; // count possibles
    prev_x = blob_box.right();
    prev_valid = this_valid;
  }

  const int32_t valid_count = gap_stats.get_total();
  if (valid_count < total_count * textord_words_minlarge) {
    // Too few large-blob gaps: start over and use every gap. The first blob
    // has no predecessor, so it is skipped explicitly rather than relying on a
    // sentinel coordinate that could produce a bogus gap for blobs with
    // strongly negative left edges.
    gap_stats.clear();
    bool have_prev = false;
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      BLOBNBOX *blob = blob_it.data();
      if (blob->joined_to_prev()) {
        continue;
      }
      const TBOX &blob_box = blob->bounding_box();
      if (have_prev && blob_box.left() - prev_x < maxwidth) {
        gap_stats.add(blob_box.left() - prev_x, 1);
      }
      prev_x = blob_box.right();
      have_prev = true;
    }
  }
  if (gap_stats.get_total() == 0) {
    row->min_space = 0; // no evidence
    row->max_nonspace = 0;
    return 0;
  }

  float lower = block->xheight * words_initial_lower;
  float upper = block->xheight * words_initial_upper;
  gap_stats.smooth(smooth_factor);
  // Re-run the clustering for as long as it keeps finding more clusters and
  // the cluster limit has not been reached.
  int32_t cluster_count = 0;
  int32_t prev_count;
  do {
    prev_count = cluster_count;
    cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop,
                                      BLOCK_STATS_CLUSTERS, cluster_stats);
  } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
  if (cluster_count < 1) {
    row->min_space = 0;
    row->max_nonspace = 0;
    return 0;
  }

  // gaps[i] is the median of cluster i + 1.
  int32_t gap_index;
  for (gap_index = 0; gap_index < cluster_count; gap_index++) {
    gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
  }
  if (testing_on) {
    tprintf("cluster_count=%d:", cluster_count);
    for (gap_index = 0; gap_index < cluster_count; gap_index++) {
      tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total());
    }
    tprintf("\n");
  }

  // Try to find proportional non-space and space for row.
  // Non-space: first cluster median not above the block's max_nonspace
  // (presumably the most frequent such cluster - depends on the order in
  // which STATS::cluster returns clusters).
  for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace;
       gap_index++) {
  }
  if (gap_index < cluster_count) {
    lower = gaps[gap_index]; // most frequent below
  } else {
    if (testing_on) {
      tprintf("No cluster below block threshold!, using default=%g\n", block->pr_nonsp);
    }
    lower = block->pr_nonsp;
  }
  // Space: first cluster median above the block's max_nonspace.
  for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace;
       gap_index++) {
  }
  if (gap_index < cluster_count) {
    upper = gaps[gap_index]; // most frequent above
  } else {
    if (testing_on) {
      tprintf("No cluster above block threshold!, using default=%g\n", block->pr_space);
    }
    upper = block->pr_space;
  }

  // Pull both limits in from lower/upper by a fraction of their separation;
  // the threshold is their integer midpoint.
  row->min_space =
      static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
  row->max_nonspace =
      static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
  row->space_size = upper;
  row->kern_size = lower;
  if (testing_on) {
    tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
            upper, row->max_nonspace, lower);
  }
  return 1;
}

/**
 * @name make_real_words
 *
 * Convert a TO_BLOCK to a BLOCK.
 *
 * Each TO_ROW is turned into a ROW and appended to block->block's row list:
 *  - rows with no blobs but with repeated-character words go through
 *    make_rep_words;
 *  - rows with blobs are segmented by make_blob_words, make_prop_words or
 *    fixed_pitch_words depending on the debug/force flags, the block's
 *    poly-block type and the row's pitch decision;
 *  - rows with neither blobs nor rep_words produce no ROW.
 * Afterwards the block's pitch statistics are stored and checked. A block
 * with no rows returns early without touching the statistics.
 */

void make_real_words(tesseract::Textord *textord,
                     TO_BLOCK *block, // block to do
                     FCOORD rotation  // for drawing
) {
  TO_ROW_IT row_it = block->get_rows();
  ROW_IT real_row_it = block->block->row_list();

  if (row_it.empty()) {
    return; // empty block
  }
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    TO_ROW *row = row_it.data();
    // Scoped per iteration so that a row producing nothing cannot re-insert
    // the ROW created for an earlier row.
    ROW *real_row = nullptr;
    if (row->blob_list()->empty() && !row->rep_words.empty()) {
      real_row = make_rep_words(row, block);
    } else if (!row->blob_list()->empty()) {
      // In a fixed pitch document, some lines may be detected as fixed pitch
      // while others don't, and will go through different path.
      // For non-space delimited language like CJK, fixed pitch chop always
      // leave the entire line as one word.  We can force consistent chopping
      // with force_make_prop_words flag.
      POLY_BLOCK *pb = block->block->pdblk.poly_block();
      if (textord_chopper_test) {
        real_row = textord->make_blob_words(row, rotation);
      } else if (textord_force_make_prop_words || (pb != nullptr && !pb->IsText()) ||
                 row->pitch_decision == PITCH_DEF_PROP || row->pitch_decision == PITCH_CORR_PROP) {
        real_row = textord->make_prop_words(row, rotation);
      } else if (row->pitch_decision == PITCH_DEF_FIXED ||
                 row->pitch_decision == PITCH_CORR_FIXED) {
        real_row = fixed_pitch_words(row, rotation);
      } else {
        // Every row with blobs is expected to carry one of the four pitch
        // decisions handled above.
        ASSERT_HOST(false);
      }
    }
    if (real_row != nullptr) {
      // put row in block
      real_row_it.add_after_then_move(real_row);
    }
  }
  block->block->set_stats(block->fixed_pitch == 0, static_cast<int16_t>(block->kern_size),
                          static_cast<int16_t>(block->space_size),
                          static_cast<int16_t>(block->fixed_pitch));
  block->block->check_pitch();
}

/**
 * @name make_rep_words
 *
 * Fabricate a real row from only the repeated blob words.
 * Get the xheight from the block as it may be more meaningful.
 *
 * Side effects: overwrites row->xheight with block->xheight and moves all
 * words out of row->rep_words into the new ROW.
 *
 * @return a newly allocated ROW, or nullptr if the row has no rep_words.
 */

ROW *make_rep_words( // make a row
    TO_ROW *row,     // row to convert
    TO_BLOCK *block  // block it lives in
) {
  if (row->rep_words.empty()) {
    return nullptr;
  }
  row->xheight = block->xheight;
  ROW *real_row =
      new ROW(row, static_cast<int16_t>(block->kern_size), static_cast<int16_t>(block->space_size));
  // put words in row
  WERD_IT word_it(real_row->word_list());
  word_it.add_list_after(&row->rep_words);
  // The ROW computes its own bounding box from the words it now owns, so no
  // separate union of the word boxes is needed here.
  real_row->recalc_bounding_box();
  return real_row;
}

/**
 * @name make_real_word
 *
 * Construct a WERD from a given number of adjacent entries in a
 * list of BLOBNBOXs.
 *
 * The BLOBNBOXes are extracted from the list and deleted. A blob flagged
 * joined_to_prev has its outlines merged into the previously added blob;
 * any other blob is appended to the word as a new C_BLOB.
 *
 * @param box_it    iterator positioned on the first blob to use; it is
 *                  advanced past the consumed blobs.
 * @param blobcount number of BLOBNBOXes to consume.
 * @param bol       if true the word is flagged W_BOL.
 * @param blanks    number of blanks for the word; values below 1 are
 *                  raised to 1.
 * @return the newly allocated word, flagged W_EOL if the iterator is at the
 *         first element of its list after the blobs were consumed.
 */

WERD *make_real_word(BLOBNBOX_IT *box_it, // iterator
                     int32_t blobcount,   // no of blobs to use
                     bool bol,            // start of line
                     uint8_t blanks       // no of blanks
) {
  C_BLOB_LIST cblobs;
  C_BLOB_IT cblob_it(&cblobs);

  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
    auto bblob = box_it->extract();
    // Outlines can only be merged into a blob that already exists. If a
    // joined_to_prev blob arrives while the word is still empty, it starts a
    // new blob instead of dereferencing an empty iterator.
    const bool merge_into_prev = bblob->joined_to_prev() && !cblob_it.empty();
    auto cblob = bblob->remove_cblob();
    if (cblob != nullptr) {
      if (merge_into_prev) {
        // Append the outlines to the previous blob, then drop the blob object.
        C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
        cout_it.move_to_last();
        cout_it.add_list_after(cblob->out_list());
        delete cblob;
      } else {
        cblob_it.add_after_then_move(cblob);
      }
    }
    delete bblob;
    box_it->forward(); // next one
  }

  if (blanks < 1) {
    blanks = 1;
  }

  auto word = new WERD(&cblobs, blanks, nullptr);

  if (bol) {
    word->set_flag(W_BOL, true);
  }
  if (box_it->at_first()) {
    word->set_flag(W_EOL, true); // at end of line
  }

  return word;
}

} // namespace tesseract

Coverage Report

Created: 2025-07-23 07:12

Line	Count	Source (jump to first uncovered line)
1		/**********************************************************************
2		* File: wordseg.cpp (Formerly wspace.c)
3		* Description: Code to segment the blobs into words.
4		* Author: Ray Smith
5		*
6		* (C) Copyright 1992, Hewlett-Packard Ltd.
7		** Licensed under the Apache License, Version 2.0 (the "License");
8		** you may not use this file except in compliance with the License.
9		** You may obtain a copy of the License at
10		** http://www.apache.org/licenses/LICENSE-2.0
11		** Unless required by applicable law or agreed to in writing, software
12		** distributed under the License is distributed on an "AS IS" BASIS,
13		** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		** See the License for the specific language governing permissions and
15		** limitations under the License.
16		*
17		**********************************************************************/
18
19		// Include automatically generated configuration file if running autoconf.
20		#ifdef HAVE_CONFIG_H
21		# include "config_auto.h"
22		#endif
23
24		#include "wordseg.h"
25
26		#include <cmath>
27
28		#include "blobbox.h"
29		#include "cjkpitch.h"
30		#include "drawtord.h"
31		#include "fpchop.h"
32		#include "makerow.h"
33		#include "pitsync1.h"
34		#include "statistc.h"
35		#include "textord.h"
36		#include "topitch.h"
37		#include "tovars.h"
38
39		namespace tesseract {
40
41		BOOL_VAR(textord_force_make_prop_words, false, "Force proportional word segmentation on all rows");
42		BOOL_VAR(textord_chopper_test, false, "Chopper is being tested.");
43
44	0	#define BLOCK_STATS_CLUSTERS 10
45
46		/**
47		* @name make_single_word
48		*
49		* For each row, arrange the blobs into one word. There is no fixed
50		* pitch detection.
51		*/
52
53	0	void make_single_word(bool one_blob, TO_ROW_LIST rows, ROW_LIST real_rows) {
54	0	TO_ROW_IT to_row_it(rows);
55	0	ROW_IT row_it(real_rows);
56	0	for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()) {
57	0	TO_ROW *row = to_row_it.data();
58		// The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
59		// to create the word.
60	0	C_BLOB_LIST cblobs;
61	0	C_BLOB_IT cblob_it(&cblobs);
62	0	BLOBNBOX_IT box_it(row->blob_list());
63	0	for (; !box_it.empty(); box_it.forward()) {
64	0	BLOBNBOX *bblob = box_it.extract();
65	0	if (bblob->joined_to_prev() \|\| (one_blob && !cblob_it.empty())) {
66	0	auto cblob = bblob->remove_cblob();
67	0	if (cblob != nullptr) {
68	0	C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
69	0	cout_it.move_to_last();
70	0	cout_it.add_list_after(cblob->out_list());
71	0	delete cblob;
72	0	}
73	0	} else {
74	0	auto cblob = bblob->remove_cblob();
75	0	if (cblob != nullptr) {
76	0	cblob_it.add_after_then_move(cblob);
77	0	}
78	0	}
79	0	delete bblob;
80	0	}
81		// Convert the TO_ROW to a ROW.
82	0	ROW *real_row =
83	0	new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
84	0	WERD_IT word_it(real_row->word_list());
85	0	WERD *word = new WERD(&cblobs, 0, nullptr);
86	0	word->set_flag(W_BOL, true);
87	0	word->set_flag(W_EOL, true);
88	0	word->set_flag(W_DONT_CHOP, one_blob);
89	0	word_it.add_after_then_move(word);
90	0	real_row->recalc_bounding_box();
91	0	row_it.add_after_then_move(real_row);
92	0	}
93	0	}
94
95		/**
96		* make_words
97		*
98		* Arrange the blobs into words.
99		*/
100		void make_words(tesseract::Textord *textord,
101		ICOORD page_tr, // top right
102		float gradient, // page skew
103		BLOCK_LIST *blocks, // block list
104	17.2k	TO_BLOCK_LIST *port_blocks) { // output list
105	17.2k	TO_BLOCK_IT block_it; // iterator
106	17.2k	TO_BLOCK *block; // current block
107
108	17.2k	if (textord->use_cjk_fp_model()) {
109	0	compute_fixed_pitch_cjk(page_tr, port_blocks);
110	17.2k	} else {
111	17.2k	compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),
112	17.2k	!bool(textord_test_landscape));
113	17.2k	}
114	17.2k	textord->to_spacing(page_tr, port_blocks);
115	17.2k	block_it.set_to_list(port_blocks);
116	34.5k	for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
117	17.2k	block = block_it.data();
118	17.2k	make_real_words(textord, block, FCOORD(1.0f, 0.0f));
119	17.2k	}
120	17.2k	}
121
122		/**
123		* @name set_row_spaces
124		*
125		* Set the min_space and max_nonspace members of the row so that
126		* the blobs can be arranged into words.
127		*/
128
129		void set_row_spaces( // find space sizes
130		TO_BLOCK *block, // block to do
131		FCOORD rotation, // for drawing
132		bool testing_on // correct orientation
133	0	) {
134	0	TO_ROW *row; // current row
135	0	TO_ROW_IT row_it = block->get_rows();
136
137	0	if (row_it.empty()) {
138	0	return; // empty block
139	0	}
140	0	for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
141	0	row = row_it.data();
142	0	if (row->fixed_pitch == 0) {
143	0	row->min_space = static_cast<int32_t>(
144	0	ceil(row->pr_space - (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));
145	0	row->max_nonspace = static_cast<int32_t>(
146	0	floor(row->pr_nonsp + (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));
147	0	if (testing_on && textord_show_initial_words) {
148	0	tprintf("Assigning defaults %d non, %d space to row at %g\n", row->max_nonspace,
149	0	row->min_space, row->intercept());
150	0	}
151	0	row->space_threshold = (row->max_nonspace + row->min_space) / 2;
152	0	row->space_size = row->pr_space;
153	0	row->kern_size = row->pr_nonsp;
154	0	}
155		#ifndef GRAPHICS_DISABLED
156		if (textord_show_initial_words && testing_on) {
157		plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);
158		}
159		#endif
160	0	}
161	0	}
162
163		/**
164		* @name row_words
165		*
166		* Compute the max nonspace and min space for the row.
167		*/
168
169		int32_t row_words( // compute space size
170		TO_BLOCK *block, // block it came from
171		TO_ROW *row, // row to operate on
172		int32_t maxwidth, // max expected space size
173		FCOORD rotation, // for drawing
174		bool testing_on // for debug
175	0	) {
176	0	bool testing_row; // contains testpt
177	0	bool prev_valid; // if decent size
178	0	int32_t prev_x; // end of prev blob
179	0	int32_t cluster_count; // no of clusters
180	0	int32_t gap_index; // which cluster
181	0	int32_t smooth_factor; // for smoothing stats
182	0	BLOBNBOX *blob; // current blob
183	0	float lower, upper; // clustering parameters
184	0	float gaps[3]; // gap clusers
185	0	ICOORD testpt;
186	0	TBOX blob_box; // bounding box
187		// iterator
188	0	BLOBNBOX_IT blob_it = row->blob_list();
189	0	STATS gap_stats(0, maxwidth - 1);
190	0	STATS cluster_stats[4]; // clusters
191
192	0	testpt = ICOORD(textord_test_x, textord_test_y);
193	0	smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
194		// if (testing_on)
195		// tprintf("Row smooth factor=%d\n",smooth_factor);
196	0	prev_valid = false;
197	0	prev_x = -INT32_MAX;
198	0	testing_row = false;
199	0	for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
200	0	blob = blob_it.data();
201	0	blob_box = blob->bounding_box();
202	0	if (blob_box.contains(testpt)) {
203	0	testing_row = true;
204	0	}
205	0	gap_stats.add(blob_box.width(), 1);
206	0	}
207	0	gap_stats.clear();
208	0	for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
209	0	blob = blob_it.data();
210	0	if (!blob->joined_to_prev()) {
211	0	blob_box = blob->bounding_box();
212	0	if (prev_valid && blob_box.left() - prev_x < maxwidth) {
213	0	gap_stats.add(blob_box.left() - prev_x, 1);
214	0	}
215	0	prev_valid = true;
216	0	prev_x = blob_box.right();
217	0	}
218	0	}
219	0	if (gap_stats.get_total() == 0) {
220	0	row->min_space = 0; // no evidence
221	0	row->max_nonspace = 0;
222	0	return 0;
223	0	}
224	0	gap_stats.smooth(smooth_factor);
225	0	lower = row->xheight * textord_words_initial_lower;
226	0	upper = row->xheight * textord_words_initial_upper;
227	0	cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
228	0	while (cluster_count < 2 && std::ceil(lower) < std::floor(upper)) {
229		// shrink gap
230	0	upper = (upper * 3 + lower) / 4;
231	0	lower = (lower * 3 + upper) / 4;
232	0	cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
233	0	}
234	0	if (cluster_count < 2) {
235	0	row->min_space = 0; // no evidence
236	0	row->max_nonspace = 0;
237	0	return 0;
238	0	}
239	0	for (gap_index = 0; gap_index < cluster_count; gap_index++) {
240	0	gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
241	0	}
242		// get medians
243	0	if (cluster_count > 2) {
244	0	if (testing_on && textord_show_initial_words) {
245	0	tprintf("Row at %g has 3 sizes of gap:%g,%g,%g\n", row->intercept(),
246	0	cluster_stats[1].ile(0.5), cluster_stats[2].ile(0.5), cluster_stats[3].ile(0.5));
247	0	}
248	0	lower = gaps[0];
249	0	if (gaps[1] > lower) {
250	0	upper = gaps[1]; // prefer most frequent
251	0	if (upper < block->xheight * textord_words_min_minspace && gaps[2] > gaps[1]) {
252	0	upper = gaps[2];
253	0	}
254	0	} else if (gaps[2] > lower && gaps[2] >= block->xheight * textord_words_min_minspace) {
255	0	upper = gaps[2];
256	0	} else if (lower >= block->xheight * textord_words_min_minspace) {
257	0	upper = lower; // not nice
258	0	lower = gaps[1];
259	0	if (testing_on && textord_show_initial_words) {
260	0	tprintf("Had to switch most common from lower to upper!!\n");
261	0	gap_stats.print();
262	0	}
263	0	} else {
264	0	row->min_space = 0; // no evidence
265	0	row->max_nonspace = 0;
266	0	return 0;
267	0	}
268	0	} else {
269	0	if (gaps[1] < gaps[0]) {
270	0	if (testing_on && textord_show_initial_words) {
271	0	tprintf("Had to switch most common from lower to upper!!\n");
272	0	gap_stats.print();
273	0	}
274	0	lower = gaps[1];
275	0	upper = gaps[0];
276	0	} else {
277	0	upper = gaps[1];
278	0	lower = gaps[0];
279	0	}
280	0	}
281	0	if (upper < block->xheight * textord_words_min_minspace) {
282	0	row->min_space = 0; // no evidence
283	0	row->max_nonspace = 0;
284	0	return 0;
285	0	}
286	0	if (upper * 3 < block->min_space * 2 + block->max_nonspace \|\|
287	0	lower * 3 > block->min_space * 2 + block->max_nonspace) {
288	0	if (testing_on && textord_show_initial_words) {
289	0	tprintf("Disagreement between block and row at %g!!\n", row->intercept());
290	0	tprintf("Lower=%g, upper=%g, Stats:\n", lower, upper);
291	0	gap_stats.print();
292	0	}
293	0	}
294	0	row->min_space =
295	0	static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
296	0	row->max_nonspace =
297	0	static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
298	0	row->space_threshold = (row->max_nonspace + row->min_space) / 2;
299	0	row->space_size = upper;
300	0	row->kern_size = lower;
301	0	if (testing_on && textord_show_initial_words) {
302	0	if (testing_row) {
303	0	tprintf("GAP STATS\n");
304	0	gap_stats.print();
305	0	tprintf("SPACE stats\n");
306	0	cluster_stats[2].print_summary();
307	0	tprintf("NONSPACE stats\n");
308	0	cluster_stats[1].print_summary();
309	0	}
310	0	tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
311	0	upper, row->max_nonspace, lower);
312	0	}
313	0	return cluster_stats[2].get_total();
314	0	}
315
316		/**
317		* @name row_words2
318		*
319		* Compute the max nonspace and min space for the row.
320		*/
321
322		int32_t row_words2( // compute space size
323		TO_BLOCK *block, // block it came from
324		TO_ROW *row, // row to operate on
325		int32_t maxwidth, // max expected space size
326		FCOORD rotation, // for drawing
327		bool testing_on // for debug
328	0	) {
329	0	bool prev_valid; // if decent size
330	0	bool this_valid; // current blob big enough
331	0	int32_t prev_x; // end of prev blob
332	0	int32_t min_width; // min interesting width
333	0	int32_t valid_count; // good gaps
334	0	int32_t total_count; // total gaps
335	0	int32_t cluster_count; // no of clusters
336	0	int32_t prev_count; // previous cluster_count
337	0	int32_t gap_index; // which cluster
338	0	int32_t smooth_factor; // for smoothing stats
339	0	BLOBNBOX *blob; // current blob
340	0	float lower, upper; // clustering parameters
341	0	ICOORD testpt;
342	0	TBOX blob_box; // bounding box
343		// iterator
344	0	BLOBNBOX_IT blob_it = row->blob_list();
345	0	STATS gap_stats(0, maxwidth - 1);
346		// gap sizes
347	0	float gaps[BLOCK_STATS_CLUSTERS];
348	0	STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
349		// clusters
350
351	0	testpt = ICOORD(textord_test_x, textord_test_y);
352	0	smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
353		// if (testing_on)
354		// tprintf("Row smooth factor=%d\n",smooth_factor);
355	0	prev_valid = false;
356	0	prev_x = -INT16_MAX;
357	0	const bool testing_row = false;
358		// min blob size
359	0	min_width = static_cast<int32_t>(block->pr_space);
360	0	total_count = 0;
361	0	for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
362	0	blob = blob_it.data();
363	0	if (!blob->joined_to_prev()) {
364	0	blob_box = blob->bounding_box();
365	0	this_valid = blob_box.width() >= min_width;
366	0	if (this_valid && prev_valid && blob_box.left() - prev_x < maxwidth) {
367	0	gap_stats.add(blob_box.left() - prev_x, 1);
368	0	}
369	0	total_count++; // count possibles
370	0	prev_x = blob_box.right();
371	0	prev_valid = this_valid;
372	0	}
373	0	}
374	0	valid_count = gap_stats.get_total();
375	0	if (valid_count < total_count * textord_words_minlarge) {
376	0	gap_stats.clear();
377	0	prev_x = -INT16_MAX;
378	0	for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
379	0	blob = blob_it.data();
380	0	if (!blob->joined_to_prev()) {
381	0	blob_box = blob->bounding_box();
382	0	if (blob_box.left() - prev_x < maxwidth) {
383	0	gap_stats.add(blob_box.left() - prev_x, 1);
384	0	}
385	0	prev_x = blob_box.right();
386	0	}
387	0	}
388	0	}
389	0	if (gap_stats.get_total() == 0) {
390	0	row->min_space = 0; // no evidence
391	0	row->max_nonspace = 0;
392	0	return 0;
393	0	}
394
395	0	cluster_count = 0;
396	0	lower = block->xheight * words_initial_lower;
397	0	upper = block->xheight * words_initial_upper;
398	0	gap_stats.smooth(smooth_factor);
399	0	do {
400	0	prev_count = cluster_count;
401	0	cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop,
402	0	BLOCK_STATS_CLUSTERS, cluster_stats);
403	0	} while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
404	0	if (cluster_count < 1) {
405	0	row->min_space = 0;
406	0	row->max_nonspace = 0;
407	0	return 0;
408	0	}
409	0	for (gap_index = 0; gap_index < cluster_count; gap_index++) {
410	0	gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
411	0	}
412		// get medians
413	0	if (testing_on) {
414	0	tprintf("cluster_count=%d:", cluster_count);
415	0	for (gap_index = 0; gap_index < cluster_count; gap_index++) {
416	0	tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total());
417	0	}
418	0	tprintf("\n");
419	0	}
420
421		// Try to find proportional non-space and space for row.
422	0	for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace;
423	0	gap_index++) {
424	0	;
425	0	}
426	0	if (gap_index < cluster_count) {
427	0	lower = gaps[gap_index]; // most frequent below
428	0	} else {
429	0	if (testing_on) {
430	0	tprintf("No cluster below block threshold!, using default=%g\n", block->pr_nonsp);
431	0	}
432	0	lower = block->pr_nonsp;
433	0	}
434	0	for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace;
435	0	gap_index++) {
436	0	;
437	0	}
438	0	if (gap_index < cluster_count) {
439	0	upper = gaps[gap_index]; // most frequent above
440	0	} else {
441	0	if (testing_on) {
442	0	tprintf("No cluster above block threshold!, using default=%g\n", block->pr_space);
443	0	}
444	0	upper = block->pr_space;
445	0	}
446	0	row->min_space =
447	0	static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
448	0	row->max_nonspace =
449	0	static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
450	0	row->space_threshold = (row->max_nonspace + row->min_space) / 2;
451	0	row->space_size = upper;
452	0	row->kern_size = lower;
453	0	if (testing_on) {
454	0	if (testing_row) {
455	0	tprintf("GAP STATS\n");
456	0	gap_stats.print();
457	0	tprintf("SPACE stats\n");
458	0	cluster_stats[2].print_summary();
459	0	tprintf("NONSPACE stats\n");
460	0	cluster_stats[1].print_summary();
461	0	}
462	0	tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
463	0	upper, row->max_nonspace, lower);
464	0	}
465	0	return 1;
466	0	}
467
468		/**
469		* @name make_real_words
470		*
471		* Convert a TO_BLOCK to a BLOCK.
472		*/
473
474		void make_real_words(tesseract::Textord *textord,
475		TO_BLOCK *block, // block to do
476		FCOORD rotation // for drawing
477	17.2k	) {
478	17.2k	TO_ROW *row; // current row
479	17.2k	TO_ROW_IT row_it = block->get_rows();
480	17.2k	ROW *real_row = nullptr; // output row
481	17.2k	ROW_IT real_row_it = block->block->row_list();
482
483	17.2k	if (row_it.empty()) {
484	519	return; // empty block
485	519	}
486	202k	for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
487	185k	row = row_it.data();
488	185k	if (row->blob_list()->empty() && !row->rep_words.empty()) {
489	0	real_row = make_rep_words(row, block);
490	185k	} else if (!row->blob_list()->empty()) {
491		// In a fixed pitch document, some lines may be detected as fixed pitch
492		// while others don't, and will go through different path.
493		// For non-space delimited language like CJK, fixed pitch chop always
494		// leave the entire line as one word. We can force consistent chopping
495		// with force_make_prop_words flag.
496	185k	POLY_BLOCK *pb = block->block->pdblk.poly_block();
497	185k	if (textord_chopper_test) {
498	0	real_row = textord->make_blob_words(row, rotation);
499	185k	} else if (textord_force_make_prop_words \|\| (pb != nullptr && !pb->IsText()) \|\|
500	185k	row->pitch_decision == PITCH_DEF_PROP \|\| row->pitch_decision == PITCH_CORR_PROP) {
501	180k	real_row = textord->make_prop_words(row, rotation);
502	180k	} else if (row->pitch_decision == PITCH_DEF_FIXED \|\|
503	5.56k	row->pitch_decision == PITCH_CORR_FIXED) {
504	5.56k	real_row = fixed_pitch_words(row, rotation);
505	5.56k	} else {
506	0	ASSERT_HOST(false);
507	0	}
508	185k	}
509	185k	if (real_row != nullptr) {
510		// put row in block
511	185k	real_row_it.add_after_then_move(real_row);
512	185k	}
513	185k	}
514	16.7k	block->block->set_stats(block->fixed_pitch == 0, static_cast<int16_t>(block->kern_size),
515	16.7k	static_cast<int16_t>(block->space_size),
516	16.7k	static_cast<int16_t>(block->fixed_pitch));
517	16.7k	block->block->check_pitch();
518	16.7k	}
519
520		/**
521		* @name make_rep_words
522		*
523		* Fabricate a real row from only the repeated blob words.
524		* Get the xheight from the block as it may be more meaningful.
525		*/
526
527		ROW *make_rep_words( // make a row
528		TO_ROW *row, // row to convert
529		TO_BLOCK *block // block it lives in
530	0	) {
531	0	ROW *real_row; // output row
532	0	TBOX word_box; // bounding box
533		// iterator
534	0	WERD_IT word_it = &row->rep_words;
535
536	0	if (word_it.empty()) {
537	0	return nullptr;
538	0	}
539	0	word_box = word_it.data()->bounding_box();
540	0	for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
541	0	word_box += word_it.data()->bounding_box();
542	0	}
543	0	row->xheight = block->xheight;
544	0	real_row =
545	0	new ROW(row, static_cast<int16_t>(block->kern_size), static_cast<int16_t>(block->space_size));
546	0	word_it.set_to_list(real_row->word_list());
547		// put words in row
548	0	word_it.add_list_after(&row->rep_words);
549	0	real_row->recalc_bounding_box();
550	0	return real_row;
551	0	}
552
553		/**
554		* @name make_real_word
555		*
556		* Construct a WERD from a given number of adjacent entries in a
557		* list of BLOBNBOXs.
558		*/
559
560		WERD make_real_word(BLOBNBOX_IT box_it, // iterator
561		int32_t blobcount, // no of blobs to use
562		bool bol, // start of line
563		uint8_t blanks // no of blanks
564	0	) {
565	0	C_OUTLINE_IT cout_it;
566	0	C_BLOB_LIST cblobs;
567	0	C_BLOB_IT cblob_it = &cblobs;
568
569	0	for (int blobindex = 0; blobindex < blobcount; blobindex++) {
570	0	auto bblob = box_it->extract();
571	0	if (bblob->joined_to_prev()) {
572	0	auto cblob = bblob->remove_cblob();
573	0	if (cblob != nullptr) {
574	0	cout_it.set_to_list(cblob_it.data()->out_list());
575	0	cout_it.move_to_last();
576	0	cout_it.add_list_after(cblob->out_list());
577	0	delete cblob;
578	0	}
579	0	} else {
580	0	auto cblob = bblob->remove_cblob();
581	0	if (cblob != nullptr) {
582	0	cblob_it.add_after_then_move(cblob);
583	0	}
584	0	}
585	0	delete bblob;
586	0	box_it->forward(); // next one
587	0	}
588
589	0	if (blanks < 1) {
590	0	blanks = 1;
591	0	}
592
593	0	auto word = new WERD(&cblobs, blanks, nullptr);
594
595	0	if (bol) {
596	0	word->set_flag(W_BOL, true);
597	0	}
598	0	if (box_it->at_first()) {
599	0	word->set_flag(W_EOL, true); // at end of line
600	0	}
601
602	0	return word;
603	0	}
604
605		} // namespace tesseract