/src/tesseract/src/ccstruct/werd.cpp

Source
/**********************************************************************
 * File:        werd.cpp  (Formerly word.c)
 * Description: Code for the WERD class.
 * Author:      Ray Smith
 *
 * (C) Copyright 1991, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#  include "config_auto.h"
#endif

#include "werd.h"

#include "linlsq.h"

#include "helpers.h"

namespace tesseract {

// Colours used by the WERD plotting functions below.
// WERD::NextColor() steps through the ScrollView colour values starting at
// FIRST_COLOUR and wraps back to FIRST_COLOUR as soon as the next value would
// reach LAST_COLOUR, so LAST_COLOUR itself is never returned by NextColor().
#define FIRST_COLOUR ScrollView::RED       ///< first rainbow colour (start of the cycle)
#define LAST_COLOUR ScrollView::AQUAMARINE ///< last rainbow colour (wrap-around bound)
#define CHILD_COLOUR ScrollView::BROWN     ///< colour of children (3rd argument to the blob plot call)

/**
 * WERD::WERD
 *
 * Build a WERD by taking over the C_BLOBs of a list.
 *   blob_list     C_BLOBs in word order. They are moved, not copied, into
 *                 this word's own lists.
 *   blank_count   number of blanks in front of the word
 *   text          correct text; nullptr is stored as an empty string
 *
 * The W_INVERSE flag of the word is decided by a vote over the blobs:
 *   1. A blob whose outlines disagree with each other about COUT_INVERSE is
 *      moved to rej_cblobs and does not vote.
 *   2. Each remaining blob votes inverse or non-inverse. W_INVERSE is set
 *      only when the inverse votes strictly outnumber the others.
 *   3. Any blob whose first outline disagrees with the resulting word flag
 *      is moved to rej_cblobs as well.
 */
WERD::WERD(C_BLOB_LIST *blob_list, uint8_t blank_count, const char *text)
    : blanks(blank_count), flags(0), script_id_(0), correct(text ? text : "") {
  C_BLOB_IT blob_it = &cblobs;
  C_BLOB_IT reject_it = &rej_cblobs;
  C_OUTLINE_IT outline_it;
  int16_t votes_inverse = 0;
  int16_t votes_normal = 0;

  // Take over the caller's blobs.
  blob_it.add_list_after(blob_list);

  // Pass 1: reject internally inconsistent blobs and count the votes.
  blob_it.set_to_list(&cblobs);
  if (blob_it.empty()) {
    return;
  }
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    outline_it.set_to_list(blob_it.data()->out_list());
    // The first outline is the reference every other outline is checked against.
    const bool first_is_inverse = outline_it.data()->flag(COUT_INVERSE);
    bool inconsistent = false;
    for (outline_it.mark_cycle_pt(); !inconsistent && !outline_it.cycled_list();
         outline_it.forward()) {
      inconsistent = outline_it.data()->flag(COUT_INVERSE) != first_is_inverse;
    }
    if (inconsistent) {
      reject_it.add_after_then_move(blob_it.extract());
      continue;
    }
    ++(first_is_inverse ? votes_inverse : votes_normal);
  }

  // The word is inverse only on a strict majority of inverse blobs.
  const bool word_is_inverse = votes_inverse > votes_normal;
  flags.set(W_INVERSE, word_is_inverse);

  // Pass 2: reject the blobs that lost the vote.
  blob_it.set_to_list(&cblobs);
  if (blob_it.empty()) {
    return;
  }
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    outline_it.set_to_list(blob_it.data()->out_list());
    const bool agrees_with_word = outline_it.data()->flag(COUT_INVERSE) == word_is_inverse;
    if (!agrees_with_word) {
      reject_it.add_after_then_move(blob_it.extract());
    }
  }
}

/**
 * WERD::WERD
 *
 * Build a WERD from a list of C_BLOBs, taking the flags, script id, correct
 * text and blank count from an existing word.
 * The blobs are handed over to the new word rather than copied, so the
 * source list ends up empty.
 */

WERD::WERD(C_BLOB_LIST *blob_list, ///< In word order
           WERD *clone)            ///< Source of flags
    : blanks(clone->blanks),
      flags(clone->flags),
      script_id_(clone->script_id_),
      correct(clone->correct) {
  C_BLOB_IT first_it = blob_list;
  C_BLOB_IT last_it = blob_list;

  // Advance the second iterator until it sits on the final element.
  for (; !last_it.at_last(); last_it.forward()) {
  }
  // Claim everything between the two iterators for our own list.
  cblobs.assign_to_sublist(&first_it, &last_it);
}

// Wraps the given blob in a newly allocated WERD that clones its flags (and
// the other cloned members) from this word. W_BOL and W_EOL are then
// overwritten with the bol and eol arguments.
WERD *WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob) {
  // The cloning constructor wants a list, so build a one-element list.
  C_BLOB_LIST single_blob_list;
  C_BLOB_IT list_it(&single_blob_list);
  list_it.add_after_then_move(blob);

  auto *result = new WERD(&single_blob_list, this);
  result->set_flag(W_BOL, bol);
  result->set_flag(W_EOL, eol);
  return result;
}

/**
 * WERD::bounding_box
 *
 * Return the bounding box of the WERD, rejected blobs included.
 *
 * History: reject cblobs used to be left out, which caused bugs when the
 * words on a row were re-sorted. The original words had been built with the
 * reject blobs included and the FUZZY SPACE flags set accordingly. When ALL
 * blobs of a word were rejected the word had no usable box, the sort went
 * wrong, and the first word in a row could end up marked as FUZZY space.
 */

TBOX WERD::bounding_box() const {
  // Ask for both the upper and the lower noise/diacritic elements.
  const bool include_upper = true;
  const bool include_lower = true;
  return restricted_bounding_box(include_upper, include_lower);
}

// Returns the box of the good blobs, grown by a selection of the rejected
// (noise/diacritic) blobs:
//   upper_dots  if false, rejected blobs whose bottom lies above the top of
//               the good-blob box are left out.
//   lower_dots  if false, rejected blobs whose top lies below the bottom of
//               the good-blob box are left out.
// The top/bottom limits are those of the good blobs only; they are not
// updated while rejected blobs are being added.
TBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const {
  TBOX result = true_bounding_box();
  const int good_bottom = result.bottom();
  const int good_top = result.top();
  // Read-only walk over the rejected blobs; the cast only satisfies the
  // iterator's constructor.
  C_BLOB_IT rej_it(const_cast<C_BLOB_LIST *>(&rej_cblobs));
  rej_it.mark_cycle_pt();
  while (!rej_it.cycled_list()) {
    const TBOX noise_box = rej_it.data()->bounding_box();
    const bool skip_as_upper = !upper_dots && noise_box.bottom() > good_top;
    const bool skip_as_lower = !lower_dots && noise_box.top() < good_bottom;
    if (!skip_as_upper && !skip_as_lower) {
      result += noise_box;
    }
    rej_it.forward();
  }
  return result;
}

// Returns the union of the bounding boxes of the good blobs only; the
// rejected blobs do not contribute.
TBOX WERD::true_bounding_box() const {
  TBOX result; // accumulates the union
  // Read-only walk over the good blobs; the cast only satisfies the
  // iterator's constructor.
  C_BLOB_IT blob_it(const_cast<C_BLOB_LIST *>(&cblobs));
  blob_it.mark_cycle_pt();
  while (!blob_it.cycled_list()) {
    result += blob_it.data()->bounding_box();
    blob_it.forward();
  }
  return result;
}

/**
 * WERD::move
 *
 * Reposition WERD by vector
 * NOTE!! REJECT CBLOBS ARE NOT MOVED
 */

void WERD::move(const ICOORD vec) {
  C_BLOB_IT cblob_it(&cblobs); // cblob iterator

  for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) {
    cblob_it.data()->move(vec);
  }
}

/**
 * WERD::join_on
 *
 * Move all blobs of the other word onto the end of this one: good blobs go
 * to the end of cblobs, rejected blobs to the end of rej_cblobs.
 * Both lists of `other` are drained; this function does not itself delete
 * `other`.
 */

void WERD::join_on(WERD *other) {
  // Good blobs first.
  C_BLOB_IT dest_it(&cblobs);
  C_BLOB_IT source_it(&other->cblobs);
  for (; !source_it.empty(); source_it.forward()) {
    dest_it.add_to_end(source_it.extract());
  }

  // Then the rejected blobs, in the same manner.
  C_BLOB_IT dest_rej_it(&rej_cblobs);
  C_BLOB_IT source_rej_it(&other->rej_cblobs);
  for (; !source_rej_it.empty(); source_rej_it.forward()) {
    dest_rej_it.add_to_end(source_rej_it.extract());
  }
}

/**
 * WERD::copy_on
 *
 * Deep-copy the blobs of the other word onto this one; `other` keeps its
 * own blobs. If the other word's bounding box starts to the left of this
 * word's, the copies are added in front of our blobs, otherwise after our
 * last blob. The rejected blobs are handled the same way, but only when the
 * other word has any.
 */

void WERD::copy_on(WERD *other) {
  const bool prepend = other->bounding_box().left() < bounding_box().left();

  // Splices a list of freshly made copies into one of our lists, in front or
  // at the back depending on the relative position of the two words.
  const auto splice_copies = [prepend](C_BLOB_LIST *destination, C_BLOB_LIST *copies) {
    C_BLOB_IT dest_it(destination);
    if (prepend) {
      dest_it.add_list_before(copies);
      return;
    }
    dest_it.move_to_last();
    dest_it.add_list_after(copies);
  };

  C_BLOB_LIST good_copies;
  good_copies.deep_copy(&other->cblobs, &C_BLOB::deep_copy);
  splice_copies(&cblobs, &good_copies);

  if (other->rej_cblobs.empty()) {
    return;
  }
  C_BLOB_LIST rejected_copies;
  rejected_copies.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy);
  splice_copies(&rej_cblobs, &rejected_copies);
}

/**
 * WERD::print
 *
 * Dump the word's members through tprintf: blank count, bounding box, the
 * flag word (decimal and octal) followed by selected individual flags, the
 * correct text, the number of rejected cblobs and the script id.
 */

void WERD::print() const {
  // Renders a single flag the way the listing below expects it.
  const auto truth = [](bool value) -> const char * { return value ? "TRUE" : "FALSE"; };
  const unsigned long flag_bits = flags.to_ulong();

  tprintf("Blanks= %d\n", blanks);
  bounding_box().print();
  tprintf("Flags = %lu = 0%lo\n", flag_bits, flag_bits);
  tprintf("   W_SEGMENTED = %s\n", truth(flags[W_SEGMENTED]));
  tprintf("   W_ITALIC = %s\n", truth(flags[W_ITALIC]));
  tprintf("   W_BOL = %s\n", truth(flags[W_BOL]));
  tprintf("   W_EOL = %s\n", truth(flags[W_EOL]));
  tprintf("   W_NORMALIZED = %s\n", truth(flags[W_NORMALIZED]));
  tprintf("   W_SCRIPT_HAS_XHEIGHT = %s\n", truth(flags[W_SCRIPT_HAS_XHEIGHT]));
  tprintf("   W_SCRIPT_IS_LATIN = %s\n", truth(flags[W_SCRIPT_IS_LATIN]));
  tprintf("   W_DONT_CHOP = %s\n", truth(flags[W_DONT_CHOP]));
  tprintf("   W_REP_CHAR = %s\n", truth(flags[W_REP_CHAR]));
  tprintf("   W_FUZZY_SP = %s\n", truth(flags[W_FUZZY_SP]));
  tprintf("   W_FUZZY_NON = %s\n", truth(flags[W_FUZZY_NON]));
  tprintf("Correct= %s\n", correct.c_str());
  tprintf("Rejected cblob count = %d\n", rej_cblobs.length());
  tprintf("Script = %d\n", script_id_);
}

/**
 * WERD::plot
 *
 * Draw the WERD in the given colour.
 */

#ifndef GRAPHICS_DISABLED
void WERD::plot(ScrollView *window, ScrollView::Color colour) {
  // Every good blob is drawn with the same colour for both colour arguments
  // of the blob's plot call.
  C_BLOB_IT blob_it = &cblobs;
  blob_it.mark_cycle_pt();
  while (!blob_it.cycled_list()) {
    blob_it.data()->plot(window, colour, colour);
    blob_it.forward();
  }
  // The rejected blobs are drawn by their own routine.
  plot_rej_blobs(window);
}

// Returns the colour following the given one in the (looping) rainbow.
// The successor is used only while it lies in [FIRST_COLOUR, LAST_COLOUR);
// anything else, including LAST_COLOUR itself, restarts at FIRST_COLOUR.
ScrollView::Color WERD::NextColor(ScrollView::Color colour) {
  const auto candidate = static_cast<ScrollView::Color>(colour + 1);
  const bool inside_rainbow = candidate >= FIRST_COLOUR && candidate < LAST_COLOUR;
  return inside_rainbow ? candidate : FIRST_COLOUR;
}

/**
 * WERD::plot
 *
 * Draw the WERD in window using rainbow colours: the first good blob is
 * drawn in FIRST_COLOUR and each following blob in the next colour of the
 * cycle, with CHILD_COLOUR as the second colour argument. The rejected
 * blobs are drawn afterwards by plot_rej_blobs.
 */

void WERD::plot(ScrollView *window) {
  C_BLOB_IT blob_it = &cblobs;
  ScrollView::Color blob_colour = FIRST_COLOUR;
  blob_it.mark_cycle_pt();
  while (!blob_it.cycled_list()) {
    blob_it.data()->plot(window, blob_colour, CHILD_COLOUR);
    blob_colour = NextColor(blob_colour);
    blob_it.forward();
  }
  plot_rej_blobs(window);
}

/**
 * WERD::plot_rej_blobs
 *
 * Draw the rejected blobs of the WERD in window. They are ALWAYS drawn in
 * GREY, for both colour arguments.
 */

void WERD::plot_rej_blobs(ScrollView *window) {
  C_BLOB_IT rej_it = &rej_cblobs;
  rej_it.mark_cycle_pt();
  while (!rej_it.cycled_list()) {
    rej_it.data()->plot(window, ScrollView::GREY, ScrollView::GREY);
    rej_it.forward();
  }
}
#endif // !GRAPHICS_DISABLED

/**
 * WERD::shallow_copy()
 *
 * Make a shallow copy of a word: a newly allocated WERD carrying this
 * word's blank count, flags, script id and correct text, but none of its
 * blobs (the blob lists of the copy are left as default-constructed).
 * The result is allocated with new and returned as a raw pointer.
 */

WERD *WERD::shallow_copy() {
  WERD *new_word = new WERD;

  new_word->blanks = blanks;
  new_word->flags = flags;
  // Carry the script id across as well, so the copy agrees with what
  // operator= and the cloning constructor transfer.
  new_word->script_id_ = script_id_;
  new_word->correct = correct;
  return new_word;
}

/**
 * WERD::operator=
 *
 * Assign a word, DEEP copying both blob lists.
 * The list-link base part is assigned first, then blanks, flags, script id
 * and correct text are copied. The existing good and rejected blobs of this
 * word are cleared and replaced by deep copies of the source's blobs.
 * Assigning a word to itself leaves it untouched.
 * Returns *this.
 */

WERD &WERD::operator=(const WERD &source) {
  // Self-assignment must be a no-op: clearing our blob lists below would
  // otherwise empty the very lists we are about to copy from, and the word
  // would lose all of its blobs.
  if (this == &source) {
    return *this;
  }
  this->ELIST2<WERD>::LINK::operator=(source);
  blanks = source.blanks;
  flags = source.flags;
  script_id_ = source.script_id_;
  correct = source.correct;
  cblobs.clear();
  cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy);
  rej_cblobs.clear();
  rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy);
  return *this;
}

/**
 *  word_comparator()
 *
 *  Comparator for sorting a word list into increasing order of left edge.
 *  Returns the left edge of word1's bounding box minus that of word2's:
 *  negative when word1 starts further left, zero when both start at the
 *  same position, positive otherwise.
 */

int word_comparator(const WERD *word1, const WERD *word2) {
  const auto left_of_first = word1->bounding_box().left();
  const auto left_of_second = word2->bounding_box().left();
  return left_of_first - left_of_second;
}

/**
 *  WERD::ConstructWerdWithNewBlobs()
 *
 * This method returns a new werd constructed using the blobs in the input
 * all_blobs list, which correspond to the blobs in this werd object. The
 * blobs used to construct the new word are consumed and removed from the
 * input all_blobs list.
 *
 *   all_blobs     candidate replacement blobs; every blob that is matched to
 *                 one of this word's blobs is extracted from this list.
 *   orphan_blobs  optional (may be nullptr). Original blobs of this word for
 *                 which no match was found are appended to the end of it.
 *
 * A candidate matches an original blob when the original's bounding box
 * contains the candidate's box or has a major overlap with it.
 *
 * This word's own blob list (as returned by cblob_list()) is moved into a
 * local list at the start. Original blobs that found a match are deleted.
 *
 * Returns the new word (allocated with new, flags etc. cloned from this
 * word), or nullptr if no candidate blob matched and so the word couldn't be
 * constructed.
 */

WERD *WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs) {
  C_BLOB_LIST current_blob_list;
  C_BLOB_IT werd_blobs_it(&current_blob_list);
  // Move the word's c_blobs into the local working list.
  werd_blobs_it.add_list_after(cblob_list());

  // New blob list. These contain the blobs which will form the new word.
  C_BLOB_LIST new_werd_blobs;
  C_BLOB_IT new_blobs_it(&new_werd_blobs);

  // not_found_blobs contains the list of current word's blobs for which a
  // corresponding blob wasn't found in the input all_blobs list.
  C_BLOB_LIST not_found_blobs;
  C_BLOB_IT not_found_it(&not_found_blobs);
  not_found_it.move_to_last();

  werd_blobs_it.move_to_first();
  for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list(); werd_blobs_it.forward()) {
    // Each original blob is taken out of the working list; from here on it is
    // either parked in not_found_blobs or deleted.
    C_BLOB *werd_blob = werd_blobs_it.extract();
    TBOX werd_blob_box = werd_blob->bounding_box();
    bool found = false;
    // Now find the corresponding blob for this blob in the all_blobs
    // list. For now, follow the inefficient method of pairwise
    // comparisons. Ideally, one can pre-bucket the blobs by row.
    C_BLOB_IT all_blobs_it(all_blobs);
    for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); all_blobs_it.forward()) {
      C_BLOB *a_blob = all_blobs_it.data();
      // Compute the overlap of the two blobs. If major, a_blob should
      // be added to the new blobs list.
      TBOX a_blob_box = a_blob->bounding_box();
      if (a_blob_box.null_box()) {
        // Only reported; the blob still goes through the overlap test below.
        tprintf("Bounding box couldn't be ascertained\n");
      }
      if (werd_blob_box.contains(a_blob_box) || werd_blob_box.major_overlap(a_blob_box)) {
        // Old blobs are from minimal splits, therefore are expected to be
        // bigger. The new small blobs should cover a significant portion.
        // This is it. The search continues, so several candidates can be
        // matched to the same original blob.
        all_blobs_it.extract();
        new_blobs_it.add_after_then_move(a_blob);
        found = true;
      }
    }
    if (!found) {
      not_found_it.add_after_then_move(werd_blob);
    } else {
      // Replaced by at least one new blob, so the original is not needed.
      delete werd_blob;
    }
  }
  // Iterate over all not found blobs. Some of them may be due to
  // under-segmentation (which is OK, since the corresponding blob is already
  // in the list in that case).
  not_found_it.move_to_first();
  for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); not_found_it.forward()) {
    C_BLOB *not_found = not_found_it.data();
    TBOX not_found_box = not_found->bounding_box();
    // Compare against every blob already selected for the new word.
    C_BLOB_IT existing_blobs_it(new_blobs_it);
    for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list();
         existing_blobs_it.forward()) {
      C_BLOB *a_blob = existing_blobs_it.data();
      TBOX a_blob_box = a_blob->bounding_box();
      // Major overlap in either direction plus a y-overlap fraction above 0.8
      // counts as the same blob.
      if ((not_found_box.major_overlap(a_blob_box) || a_blob_box.major_overlap(not_found_box)) &&
          not_found_box.y_overlap_fraction(a_blob_box) > 0.8) {
        // Already taken care of.
        delete not_found_it.extract();
        break;
      }
    }
  }
  if (orphan_blobs) {
    // Hand the remaining unmatched originals to the caller, after whatever
    // the orphan list already holds.
    C_BLOB_IT orphan_blobs_it(orphan_blobs);
    orphan_blobs_it.move_to_last();
    orphan_blobs_it.add_list_after(&not_found_blobs);
  }

  // New blobs are ready. Create a new werd object with these.
  WERD *new_werd = nullptr;
  if (!new_werd_blobs.empty()) {
    new_werd = new WERD(&new_werd_blobs, this);
  } else {
    // Add the blobs back to this word so that it can be reused.
    // NOTE(review): when orphan_blobs was supplied, the unmatched blobs appear
    // to have been moved there already, leaving nothing to restore here -
    // confirm this is intended.
    C_BLOB_IT this_list_it(cblob_list());
    this_list_it.add_list_after(&not_found_blobs);
  }
  return new_werd;
}

// Removes noise from the word by moving small outlines to the rej_cblobs
// list, based on the size_threshold.
void WERD::CleanNoise(float size_threshold) {
  C_BLOB_IT blob_it(&cblobs);
  C_BLOB_IT rej_it(&rej_cblobs);
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    C_BLOB *blob = blob_it.data();
    C_OUTLINE_IT ol_it(blob->out_list());
    for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
      C_OUTLINE *outline = ol_it.data();
      TBOX ol_box = outline->bounding_box();
      int ol_size = ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height();
      if (ol_size < size_threshold) {
        // This outline is too small. Move it to a separate blob in the
        // reject blobs list.
        auto *rej_blob = new C_BLOB(ol_it.extract());
        rej_it.add_after_then_move(rej_blob);
      }
    }
    if (blob->out_list()->empty()) {
      delete blob_it.extract();
    }
  }
}

// Empties rej_cblobs: from each rejected blob one outline is extracted and
// appended to the given vector, then the blob itself is deleted.
// Afterwards, the outlines vector owns the pointers.
// NOTE(review): only the first outline of each rejected blob is kept; any
// further outlines are presumably destroyed along with the blob - confirm
// that rejected blobs hold exactly one outline whenever this is called.
void WERD::GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines) {
  C_BLOB_IT noise_it(&rej_cblobs);
  noise_it.mark_cycle_pt();
  while (!noise_it.empty()) {
    C_BLOB *noise_blob = noise_it.extract();
    C_OUTLINE_IT outline_it(noise_blob->out_list());
    outlines->push_back(outline_it.extract());
    delete noise_blob;
    noise_it.forward();
  }
}

// Adds the selected outlines to the indicated real blobs, and puts the rest
// back in rej_cblobs where they came from. Where the target_blobs entry is
// nullptr, a run of wanted outlines is put into a single new blob.
// Ownership of the outlines is transferred back to the word. (Hence
// vector and not PointerVector.)
//
//   wanted[i]             true if outlines[i] is to become part of a real
//                         blob, false if it goes back onto rej_cblobs.
//   target_blobs[i]       blob that receives outlines[i] when wanted, or
//                         nullptr to have a new blob made for it.
//   outlines              the outlines to distribute; nullptr entries are
//                         skipped.
//   make_next_word_fuzzy  optional output (may be nullptr); reset to false on
//                         entry.
// The three vectors are indexed in parallel; no size check is made here.
//
// Returns true if any new blob was added to the start of the word (while the
// word has neither W_FUZZY_SP nor W_FUZZY_NON set), which
// suggests that it might need joining to the word before it, and likewise
// sets make_next_word_fuzzy true if any new blob was added to the end.
bool WERD::AddSelectedOutlines(const std::vector<bool> &wanted,
                               const std::vector<C_BLOB *> &target_blobs,
                               const std::vector<C_OUTLINE *> &outlines,
                               bool *make_next_word_fuzzy) {
  bool outline_added_to_start = false;
  if (make_next_word_fuzzy != nullptr) {
    *make_next_word_fuzzy = false;
  }
  C_BLOB_IT rej_it(&rej_cblobs);
  for (unsigned i = 0; i < outlines.size(); ++i) {
    C_OUTLINE *outline = outlines[i];
    if (outline == nullptr) {
      continue; // Already used it.
    }
    if (wanted[i]) {
      C_BLOB *target_blob = target_blobs[i];
      TBOX noise_box = outline->bounding_box();
      if (target_blob == nullptr) {
        // No existing blob was chosen: start a new blob from this outline.
        target_blob = new C_BLOB(outline);
        // Need to find the insertion point: in front of the first blob whose
        // left edge lies to the right of the outline's left edge.
        C_BLOB_IT blob_it(&cblobs);
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
          C_BLOB *blob = blob_it.data();
          TBOX blob_box = blob->bounding_box();
          if (blob_box.left() > noise_box.left()) {
            if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) {
              // We might want to join this word to its predecessor.
              outline_added_to_start = true;
            }
            blob_it.add_before_stay_put(target_blob);
            break;
          }
        }
        // The search came to an end without inserting the blob (no blob lies
        // further right), so the new blob goes at the end of the word.
        if (blob_it.cycled_list()) {
          blob_it.add_to_end(target_blob);
          if (make_next_word_fuzzy != nullptr) {
            *make_next_word_fuzzy = true;
          }
        }
        // Add all consecutive wanted, but null-blob outlines to same blob.
        // This advances i, so the outer loop does not visit them again.
        C_OUTLINE_IT ol_it(target_blob->out_list());
        while (i + 1 < outlines.size() && wanted[i + 1] && target_blobs[i + 1] == nullptr) {
          ++i;
          ol_it.add_to_end(outlines[i]);
        }
      } else {
        // Insert outline into this blob.
        C_OUTLINE_IT ol_it(target_blob->out_list());
        ol_it.add_to_end(outline);
      }
    } else {
      // Put back on noise list, wrapped in a new blob of its own.
      rej_it.add_to_end(new C_BLOB(outline));
    }
  }
  return outline_added_to_start;
}

} // namespace tesseract

Coverage Report

Created: 2026-04-01 07:03

Line	Count	Source
1		/**********************************************************************
2		* File: werd.cpp (Formerly word.c)
3		* Description: Code for the WERD class.
4		* Author: Ray Smith
5		*
6		* (C) Copyright 1991, Hewlett-Packard Ltd.
7		** Licensed under the Apache License, Version 2.0 (the "License");
8		** you may not use this file except in compliance with the License.
9		** You may obtain a copy of the License at
10		** http://www.apache.org/licenses/LICENSE-2.0
11		** Unless required by applicable law or agreed to in writing, software
12		** distributed under the License is distributed on an "AS IS" BASIS,
13		** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		** See the License for the specific language governing permissions and
15		** limitations under the License.
16		*
17		**********************************************************************/
18
19		// Include automatically generated configuration file if running autoconf.
20		#ifdef HAVE_CONFIG_H
21		# include "config_auto.h"
22		#endif
23
24		#include "werd.h"
25
26		#include "linlsq.h"
27
28		#include "helpers.h"
29
30		namespace tesseract {
31
32		#define FIRST_COLOUR ScrollView::RED ///< first rainbow colour
33		#define LAST_COLOUR ScrollView::AQUAMARINE ///< last rainbow colour
34		#define CHILD_COLOUR ScrollView::BROWN ///< colour of children
35
36		/**
37		* WERD::WERD
38		*
39		* Constructor to build a WERD from a list of C_BLOBs.
40		* blob_list The C_BLOBs (in word order) are not copied;
41		* we take its elements and put them in our lists.
42		* blank_count blanks in front of the word
43		* text correct text, outlives this WERD
44		*/
45		WERD::WERD(C_BLOB_LIST blob_list, uint8_t blank_count, const char text)
46	395k	: blanks(blank_count), flags(0), script_id_(0), correct(text ? text : "") {
47	395k	C_BLOB_IT start_it = &cblobs;
48	395k	C_BLOB_IT rej_cblob_it = &rej_cblobs;
49	395k	C_OUTLINE_IT c_outline_it;
50	395k	int16_t inverted_vote = 0;
51	395k	int16_t non_inverted_vote = 0;
52
53		// Move blob_list's elements into cblobs.
54	395k	start_it.add_list_after(blob_list);
55
56		/*
57		Set white on black flag for the WERD, moving any duff blobs onto the
58		rej_cblobs list.
59		First, walk the cblobs checking the inverse flag for each outline of each
60		cblob. If a cblob has inconsistent flag settings for its different
61		outlines, move the blob to the reject list. Otherwise, increment the
62		appropriate w-on-b or b-on-w vote for the word.
63
64		Now set the inversion flag for the WERD by maximum vote.
65
66		Walk the blobs again, moving any blob whose inversion flag does not agree
67		with the concencus onto the reject list.
68		*/
69	395k	start_it.set_to_list(&cblobs);
70	395k	if (start_it.empty()) {
71	0	return;
72	0	}
73	1.75M	for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
74	1.35M	bool reject_blob = false;
75	1.35M	bool blob_inverted;
76
77	1.35M	c_outline_it.set_to_list(start_it.data()->out_list());
78	1.35M	blob_inverted = c_outline_it.data()->flag(COUT_INVERSE);
79	3.79M	for (c_outline_it.mark_cycle_pt(); !c_outline_it.cycled_list() && !reject_blob;
80	2.44M	c_outline_it.forward()) {
81	2.44M	reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted;
82	2.44M	}
83	1.35M	if (reject_blob) {
84	11.4k	rej_cblob_it.add_after_then_move(start_it.extract());
85	1.34M	} else {
86	1.34M	if (blob_inverted) {
87	383k	inverted_vote++;
88	960k	} else {
89	960k	non_inverted_vote++;
90	960k	}
91	1.34M	}
92	1.35M	}
93
94	395k	flags.set(W_INVERSE, (inverted_vote > non_inverted_vote));
95
96	395k	start_it.set_to_list(&cblobs);
97	395k	if (start_it.empty()) {
98	1.04k	return;
99	1.04k	}
100	1.73M	for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
101	1.34M	c_outline_it.set_to_list(start_it.data()->out_list());
102	1.34M	if (c_outline_it.data()->flag(COUT_INVERSE) != flags[W_INVERSE]) {
103	21.1k	rej_cblob_it.add_after_then_move(start_it.extract());
104	21.1k	}
105	1.34M	}
106	394k	}
107
108		/**
109		* WERD::WERD
110		*
111		* Constructor to build a WERD from a list of C_BLOBs.
112		* The C_BLOBs are not copied so the source list is emptied.
113		*/
114
115		WERD::WERD(C_BLOB_LIST *blob_list, ///< In word order
116		WERD *clone) ///< Source of flags
117	0	: flags(clone->flags), script_id_(clone->script_id_), correct(clone->correct) {
118	0	C_BLOB_IT start_it = blob_list; // iterator
119	0	C_BLOB_IT end_it = blob_list; // another
120
121	0	while (!end_it.at_last()) {
122	0	end_it.forward(); // move to last
123	0	}
124	0	cblobs.assign_to_sublist(&start_it, &end_it);
125		// move to our list
126	0	blanks = clone->blanks;
127		// fprintf(stderr,"Wrong constructor!!!!\n");
128	0	}
129
130		// Construct a WERD from a single_blob and clone the flags from this.
131		// W_BOL and W_EOL flags are set according to the given values.
132	0	WERD WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB blob) {
133	0	C_BLOB_LIST temp_blobs;
134	0	C_BLOB_IT temp_it(&temp_blobs);
135	0	temp_it.add_after_then_move(blob);
136	0	WERD *blob_word = new WERD(&temp_blobs, this);
137	0	blob_word->set_flag(W_BOL, bol);
138	0	blob_word->set_flag(W_EOL, eol);
139	0	return blob_word;
140	0	}
141
142		/**
143		* WERD::bounding_box
144		*
145		* Return the bounding box of the WERD.
146		* This is quite a mess to compute!
147		* ORIGINALLY, REJECT CBLOBS WERE EXCLUDED, however, this led to bugs when the
148		* words on the row were re-sorted. The original words were built with reject
149		* blobs included. The FUZZY SPACE flags were set accordingly. If ALL the
150		* blobs in a word are rejected the BB for the word is nullptr, causing the sort
151		* to screw up, leading to the erroneous possibility of the first word in a
152		* row being marked as FUZZY space.
153		*/
154
155	3.37M	TBOX WERD::bounding_box() const {
156	3.37M	return restricted_bounding_box(true, true);
157	3.37M	}
158
159		// Returns the bounding box including the desired combination of upper and
160		// lower noise/diacritic elements.
161	3.37M	TBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const {
162	3.37M	TBOX box = true_bounding_box();
163	3.37M	int bottom = box.bottom();
164	3.37M	int top = box.top();
165		// This is a read-only iteration of the rejected blobs.
166	3.37M	C_BLOB_IT it(const_cast<C_BLOB_LIST *>(&rej_cblobs));
167	4.56M	for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
168	1.18M	TBOX dot_box = it.data()->bounding_box();
169	1.18M	if ((upper_dots \|\| dot_box.bottom() <= top) && (lower_dots \|\| dot_box.top() >= bottom)) {
170	1.18M	box += dot_box;
171	1.18M	}
172	1.18M	}
173	3.37M	return box;
174	3.37M	}
175
176		// Returns the bounding box of only the good blobs.
177	3.37M	TBOX WERD::true_bounding_box() const {
178	3.37M	TBOX box; // box being built
179		// This is a read-only iteration of the good blobs.
180	3.37M	C_BLOB_IT it(const_cast<C_BLOB_LIST *>(&cblobs));
181	55.3M	for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
182	51.9M	box += it.data()->bounding_box();
183	51.9M	}
184	3.37M	return box;
185	3.37M	}
186
187		/**
188		* WERD::move
189		*
190		* Reposition WERD by vector
191		* NOTE!! REJECT CBLOBS ARE NOT MOVED
192		*/
193
194	0	void WERD::move(const ICOORD vec) {
195	0	C_BLOB_IT cblob_it(&cblobs); // cblob iterator
196
197	0	for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) {
198	0	cblob_it.data()->move(vec);
199	0	}
200	0	}
201
202		/**
203		* WERD::join_on
204		*
205		* Join other word onto this one. Delete the old word.
206		*/
207
208	0	void WERD::join_on(WERD *other) {
209	0	C_BLOB_IT blob_it(&cblobs);
210	0	C_BLOB_IT src_it(&other->cblobs);
211	0	C_BLOB_IT rej_cblob_it(&rej_cblobs);
212	0	C_BLOB_IT src_rej_it(&other->rej_cblobs);
213
214	0	while (!src_it.empty()) {
215	0	blob_it.add_to_end(src_it.extract());
216	0	src_it.forward();
217	0	}
218	0	while (!src_rej_it.empty()) {
219	0	rej_cblob_it.add_to_end(src_rej_it.extract());
220	0	src_rej_it.forward();
221	0	}
222	0	}
223
224		/**
225		* WERD::copy_on
226		*
227		* Copy blobs from other word onto this one.
228		*/
229
230	39.6k	void WERD::copy_on(WERD *other) {
231	39.6k	bool reversed = other->bounding_box().left() < bounding_box().left();
232	39.6k	C_BLOB_IT c_blob_it(&cblobs);
233	39.6k	C_BLOB_LIST c_blobs;
234
235	39.6k	c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy);
236	39.6k	if (reversed) {
237	0	c_blob_it.add_list_before(&c_blobs);
238	39.6k	} else {
239	39.6k	c_blob_it.move_to_last();
240	39.6k	c_blob_it.add_list_after(&c_blobs);
241	39.6k	}
242	39.6k	if (!other->rej_cblobs.empty()) {
243	3.44k	C_BLOB_IT rej_c_blob_it(&rej_cblobs);
244	3.44k	C_BLOB_LIST new_rej_c_blobs;
245
246	3.44k	new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy);
247	3.44k	if (reversed) {
248	0	rej_c_blob_it.add_list_before(&new_rej_c_blobs);
249	3.44k	} else {
250	3.44k	rej_c_blob_it.move_to_last();
251	3.44k	rej_c_blob_it.add_list_after(&new_rej_c_blobs);
252	3.44k	}
253	3.44k	}
254	39.6k	}
255
256		/**
257		* WERD::print
258		*
259		* Display members
260		*/
261
262	0	void WERD::print() const {
263	0	tprintf("Blanks= %d\n", blanks);
264	0	bounding_box().print();
265	0	tprintf("Flags = %lu = 0%lo\n", flags.to_ulong(), flags.to_ulong());
266	0	tprintf(" W_SEGMENTED = %s\n", flags[W_SEGMENTED] ? "TRUE" : "FALSE");
267	0	tprintf(" W_ITALIC = %s\n", flags[W_ITALIC] ? "TRUE" : "FALSE");
268	0	tprintf(" W_BOL = %s\n", flags[W_BOL] ? "TRUE" : "FALSE");
269	0	tprintf(" W_EOL = %s\n", flags[W_EOL] ? "TRUE" : "FALSE");
270	0	tprintf(" W_NORMALIZED = %s\n", flags[W_NORMALIZED] ? "TRUE" : "FALSE");
271	0	tprintf(" W_SCRIPT_HAS_XHEIGHT = %s\n", flags[W_SCRIPT_HAS_XHEIGHT] ? "TRUE" : "FALSE");
272	0	tprintf(" W_SCRIPT_IS_LATIN = %s\n", flags[W_SCRIPT_IS_LATIN] ? "TRUE" : "FALSE");
273	0	tprintf(" W_DONT_CHOP = %s\n", flags[W_DONT_CHOP] ? "TRUE" : "FALSE");
274	0	tprintf(" W_REP_CHAR = %s\n", flags[W_REP_CHAR] ? "TRUE" : "FALSE");
275	0	tprintf(" W_FUZZY_SP = %s\n", flags[W_FUZZY_SP] ? "TRUE" : "FALSE");
276	0	tprintf(" W_FUZZY_NON = %s\n", flags[W_FUZZY_NON] ? "TRUE" : "FALSE");
277	0	tprintf("Correct= %s\n", correct.c_str());
278	0	tprintf("Rejected cblob count = %d\n", rej_cblobs.length());
279	0	tprintf("Script = %d\n", script_id_);
280	0	}
281
282		/**
283		* WERD::plot
284		*
285		* Draw the WERD in the given colour.
286		*/
287
288		#ifndef GRAPHICS_DISABLED
289		void WERD::plot(ScrollView *window, ScrollView::Color colour) {
290		C_BLOB_IT it = &cblobs;
291		for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
292		it.data()->plot(window, colour, colour);
293		}
294		plot_rej_blobs(window);
295		}
296
297		// Get the next color in the (looping) rainbow.
298		ScrollView::Color WERD::NextColor(ScrollView::Color colour) {
299		auto next = static_cast<ScrollView::Color>(colour + 1);
300		if (next >= LAST_COLOUR \|\| next < FIRST_COLOUR) {
301		next = FIRST_COLOUR;
302		}
303		return next;
304		}
305
306		/**
307		* WERD::plot
308		*
309		* Draw the WERD in rainbow colours in window.
310		*/
311
312		void WERD::plot(ScrollView *window) {
313		ScrollView::Color colour = FIRST_COLOUR;
314		C_BLOB_IT it = &cblobs;
315		for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
316		it.data()->plot(window, colour, CHILD_COLOUR);
317		colour = NextColor(colour);
318		}
319		plot_rej_blobs(window);
320		}
321
322		/**
323		* WERD::plot_rej_blobs
324		*
325		* Draw the WERD rejected blobs in window - ALWAYS GREY
326		*/
327
328		void WERD::plot_rej_blobs(ScrollView *window) {
329		C_BLOB_IT it = &rej_cblobs;
330		for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
331		it.data()->plot(window, ScrollView::GREY, ScrollView::GREY);
332		}
333		}
334		#endif // !GRAPHICS_DISABLED
335
336		/**
337		* WERD::shallow_copy()
338		*
339		* Make a shallow copy of a word
340		*/
341
342	0	WERD *WERD::shallow_copy() {
343	0	WERD *new_word = new WERD;
344
345	0	new_word->blanks = blanks;
346	0	new_word->flags = flags;
347	0	new_word->correct = correct;
348	0	return new_word;
349	0	}
350
351		/**
352		* WERD::operator=
353		*
354		* Assign a word, DEEP copying the blob list
355		*/
356
357	119k	WERD &WERD::operator=(const WERD &source) {
358	119k	this->ELIST2<WERD>::LINK::operator=(source);
359	119k	blanks = source.blanks;
360	119k	flags = source.flags;
361	119k	script_id_ = source.script_id_;
362	119k	correct = source.correct;
363	119k	cblobs.clear();
364	119k	cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy);
365	119k	rej_cblobs.clear();
366	119k	rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy);
367	119k	return *this;
368	119k	}
369
370		/**
371		* word_comparator()
372		*
373		* word comparator used to sort a word list so that words are in increasing
374		* order of left edge.
375		*/
376
377	0	int word_comparator(const WERD word1, const WERD word2) {
378	0	return word1->bounding_box().left() - word2->bounding_box().left();
379	0	}
380
381		/**
382		* WERD::ConstructWerdWithNewBlobs()
383		*
384		* This method returns a new werd constructed using the blobs in the input
385		* all_blobs list, which correspond to the blobs in this werd object. The
386		* blobs used to construct the new word are consumed and removed from the
387		* input all_blobs list.
388		* Returns nullptr if the word couldn't be constructed.
389		* Returns original blobs for which no matches were found in the output list
390		* orphan_blobs (appends).
391		*/
392
393	0	WERD WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST all_blobs, C_BLOB_LIST *orphan_blobs) {
394	0	C_BLOB_LIST current_blob_list;
395	0	C_BLOB_IT werd_blobs_it(&current_blob_list);
396		// Add the word's c_blobs.
397	0	werd_blobs_it.add_list_after(cblob_list());
398
399		// New blob list. These contain the blobs which will form the new word.
400	0	C_BLOB_LIST new_werd_blobs;
401	0	C_BLOB_IT new_blobs_it(&new_werd_blobs);
402
403		// not_found_blobs contains the list of current word's blobs for which a
404		// corresponding blob wasn't found in the input all_blobs list.
405	0	C_BLOB_LIST not_found_blobs;
406	0	C_BLOB_IT not_found_it(&not_found_blobs);
407	0	not_found_it.move_to_last();
408
409	0	werd_blobs_it.move_to_first();
410	0	for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list(); werd_blobs_it.forward()) {
411	0	C_BLOB *werd_blob = werd_blobs_it.extract();
412	0	TBOX werd_blob_box = werd_blob->bounding_box();
413	0	bool found = false;
414		// Now find the corresponding blob for this blob in the all_blobs
415		// list. For now, follow the inefficient method of pairwise
416		// comparisons. Ideally, one can pre-bucket the blobs by row.
417	0	C_BLOB_IT all_blobs_it(all_blobs);
418	0	for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); all_blobs_it.forward()) {
419	0	C_BLOB *a_blob = all_blobs_it.data();
420		// Compute the overlap of the two blobs. If major, a_blob should
421		// be added to the new blobs list.
422	0	TBOX a_blob_box = a_blob->bounding_box();
423	0	if (a_blob_box.null_box()) {
424	0	tprintf("Bounding box couldn't be ascertained\n");
425	0	}
426	0	if (werd_blob_box.contains(a_blob_box) \|\| werd_blob_box.major_overlap(a_blob_box)) {
427		// Old blobs are from minimal splits, therefore are expected to be
428		// bigger. The new small blobs should cover a significant portion.
429		// This is it.
430	0	all_blobs_it.extract();
431	0	new_blobs_it.add_after_then_move(a_blob);
432	0	found = true;
433	0	}
434	0	}
435	0	if (!found) {
436	0	not_found_it.add_after_then_move(werd_blob);
437	0	} else {
438	0	delete werd_blob;
439	0	}
440	0	}
441		// Iterate over all not found blobs. Some of them may be due to
442		// under-segmentation (which is OK, since the corresponding blob is already
443		// in the list in that case.
444	0	not_found_it.move_to_first();
445	0	for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); not_found_it.forward()) {
446	0	C_BLOB *not_found = not_found_it.data();
447	0	TBOX not_found_box = not_found->bounding_box();
448	0	C_BLOB_IT existing_blobs_it(new_blobs_it);
449	0	for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list();
450	0	existing_blobs_it.forward()) {
451	0	C_BLOB *a_blob = existing_blobs_it.data();
452	0	TBOX a_blob_box = a_blob->bounding_box();
453	0	if ((not_found_box.major_overlap(a_blob_box) \|\| a_blob_box.major_overlap(not_found_box)) &&
454	0	not_found_box.y_overlap_fraction(a_blob_box) > 0.8) {
455		// Already taken care of.
456	0	delete not_found_it.extract();
457	0	break;
458	0	}
459	0	}
460	0	}
461	0	if (orphan_blobs) {
462	0	C_BLOB_IT orphan_blobs_it(orphan_blobs);
463	0	orphan_blobs_it.move_to_last();
464	0	orphan_blobs_it.add_list_after(&not_found_blobs);
465	0	}
466
467		// New blobs are ready. Create a new werd object with these.
468	0	WERD *new_werd = nullptr;
469	0	if (!new_werd_blobs.empty()) {
470	0	new_werd = new WERD(&new_werd_blobs, this);
471	0	} else {
472		// Add the blobs back to this word so that it can be reused.
473	0	C_BLOB_IT this_list_it(cblob_list());
474	0	this_list_it.add_list_after(&not_found_blobs);
475	0	}
476	0	return new_werd;
477	0	}
478
479		// Removes noise from the word by moving small outlines to the rej_cblobs
480		// list, based on the size_threshold.
481	3.79k	void WERD::CleanNoise(float size_threshold) {
482	3.79k	C_BLOB_IT blob_it(&cblobs);
483	3.79k	C_BLOB_IT rej_it(&rej_cblobs);
484	26.6k	for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
485	22.8k	C_BLOB *blob = blob_it.data();
486	22.8k	C_OUTLINE_IT ol_it(blob->out_list());
487	135k	for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
488	112k	C_OUTLINE *outline = ol_it.data();
489	112k	TBOX ol_box = outline->bounding_box();
490	112k	int ol_size = ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height();
491	112k	if (ol_size < size_threshold) {
492		// This outline is too small. Move it to a separate blob in the
493		// reject blobs list.
494	96.5k	auto *rej_blob = new C_BLOB(ol_it.extract());
495	96.5k	rej_it.add_after_then_move(rej_blob);
496	96.5k	}
497	112k	}
498	22.8k	if (blob->out_list()->empty()) {
499	13.3k	delete blob_it.extract();
500	13.3k	}
501	22.8k	}
502	3.79k	}
503
504		// Extracts all the noise outlines and stuffs the pointers into the given
505		// vector of outlines. Afterwards, the outlines vector owns the pointers.
506	0	void WERD::GetNoiseOutlines(std::vector<C_OUTLINE > outlines) {
507	0	C_BLOB_IT rej_it(&rej_cblobs);
508	0	for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) {
509	0	C_BLOB *blob = rej_it.extract();
510	0	C_OUTLINE_IT ol_it(blob->out_list());
511	0	outlines->push_back(ol_it.extract());
512	0	delete blob;
513	0	}
514	0	}
515
516		// Adds the selected outlines to the indicated real blobs, and puts the rest
517		// back in rej_cblobs where they came from. Where the target_blobs entry is
518		// nullptr, a run of wanted outlines is put into a single new blob.
519		// Ownership of the outlines is transferred back to the word. (Hence
520		// vector and not PointerVector.)
521		// Returns true if any new blob was added to the start of the word, which
522		// suggests that it might need joining to the word before it, and likewise
523		// sets make_next_word_fuzzy true if any new blob was added to the end.
524		bool WERD::AddSelectedOutlines(const std::vector<bool> &wanted,
525		const std::vector<C_BLOB *> &target_blobs,
526		const std::vector<C_OUTLINE *> &outlines,
527	0	bool *make_next_word_fuzzy) {
528	0	bool outline_added_to_start = false;
529	0	if (make_next_word_fuzzy != nullptr) {
530	0	*make_next_word_fuzzy = false;
531	0	}
532	0	C_BLOB_IT rej_it(&rej_cblobs);
533	0	for (unsigned i = 0; i < outlines.size(); ++i) {
534	0	C_OUTLINE *outline = outlines[i];
535	0	if (outline == nullptr) {
536	0	continue; // Already used it.
537	0	}
538	0	if (wanted[i]) {
539	0	C_BLOB *target_blob = target_blobs[i];
540	0	TBOX noise_box = outline->bounding_box();
541	0	if (target_blob == nullptr) {
542	0	target_blob = new C_BLOB(outline);
543		// Need to find the insertion point.
544	0	C_BLOB_IT blob_it(&cblobs);
545	0	for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
546	0	C_BLOB *blob = blob_it.data();
547	0	TBOX blob_box = blob->bounding_box();
548	0	if (blob_box.left() > noise_box.left()) {
549	0	if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) {
550		// We might want to join this word to its predecessor.
551	0	outline_added_to_start = true;
552	0	}
553	0	blob_it.add_before_stay_put(target_blob);
554	0	break;
555	0	}
556	0	}
557	0	if (blob_it.cycled_list()) {
558	0	blob_it.add_to_end(target_blob);
559	0	if (make_next_word_fuzzy != nullptr) {
560	0	*make_next_word_fuzzy = true;
561	0	}
562	0	}
563		// Add all consecutive wanted, but null-blob outlines to same blob.
564	0	C_OUTLINE_IT ol_it(target_blob->out_list());
565	0	while (i + 1 < outlines.size() && wanted[i + 1] && target_blobs[i + 1] == nullptr) {
566	0	++i;
567	0	ol_it.add_to_end(outlines[i]);
568	0	}
569	0	} else {
570		// Insert outline into this blob.
571	0	C_OUTLINE_IT ol_it(target_blob->out_list());
572	0	ol_it.add_to_end(outline);
573	0	}
574	0	} else {
575		// Put back on noise list.
576	0	rej_it.add_to_end(new C_BLOB(outline));
577	0	}
578	0	}
579	0	return outline_added_to_start;
580	0	}
581
582		} // namespace tesseract