/src/tesseract/src/ccmain/linerec.cpp

Source (jump to first uncovered line)
///////////////////////////////////////////////////////////////////////
// File:        linerec.cpp
// Description: Top-level line-based recognition module for Tesseract.
// Author:      Ray Smith
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#include "tesseractclass.h"

#include <allheaders.h>
#include "boxread.h"
#include "imagedata.h" // for ImageData
#include "lstmrecognizer.h"
#include "pageres.h"
#include "recodebeam.h"
#include "tprintf.h"

#include <algorithm>

namespace tesseract {

// Multiplier applied to LSTM certainties to make them more comparable to
// Tesseract's.
constexpr float kCertaintyScale = 7.0f;
// Lowest certainty that is still acceptable for a dictionary word.
constexpr float kWorstDictCertainty = -25.0f;

// Produces training data for a line recognizer (eg LSTM) from one page.
// The boxes read for input_imagename are split into text lines, each line is
// added as a page of a DocumentData, and the document is shuffled and
// serialized to output_basename + ".lstmf".
// When applybox_page > 0 the existing .lstmf document is loaded first, so the
// lines of this page are appended to the pages already written.
// Returns true on success. On any failure a message is printed and false is
// returned.
bool Tesseract::TrainLineRecognizer(const char *input_imagename, const std::string &output_basename,
                                    BLOCK_LIST *block_list) {
  const std::string lstmf_name = output_basename + ".lstmf";
  const char *lstmf_path = lstmf_name.c_str();
  DocumentData images(lstmf_name);

  // Later pages extend the document that was saved for the previous pages.
  if (applybox_page > 0 && !images.LoadDocument(lstmf_path, 0, 0, nullptr)) {
    tprintf("Failed to read training data from %s!\n", lstmf_path);
    return false;
  }

  // Fetch the boxes of this page; having none at all counts as a failure.
  std::vector<TBOX> boxes;
  std::vector<std::string> texts;
  const bool boxes_read =
      ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr, nullptr);
  if (!boxes_read || boxes.empty()) {
    tprintf("Failed to read boxes from %s\n", input_imagename);
    return false;
  }

  TrainFromBoxes(boxes, texts, block_list, &images);
  if (images.PagesSize() == 0) {
    tprintf("Failed to read pages from %s\n", input_imagename);
    return false;
  }

  images.Shuffle();
  const bool saved = images.SaveDocument(lstmf_path, nullptr);
  if (!saved) {
    tprintf("Failed to write training data to %s!\n", lstmf_path);
  }
  return saved;
}

// Generates training data for training a line recognizer, eg LSTM.
// Breaks the boxes into lines, normalizes them, converts to ImageData and
// appends them to the given training_data.
void Tesseract::TrainFromBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
                               BLOCK_LIST *block_list, DocumentData *training_data) {
  auto box_count = boxes.size();
  // Process all the text lines in this page, as defined by the boxes.
  unsigned end_box = 0;
  // Don't let \t, which marks newlines in the box file, get into the line
  // content, as that makes the line unusable in training.
  while (end_box < texts.size() && texts[end_box] == "\t") {
    ++end_box;
  }
  for (auto start_box = end_box; start_box < box_count; start_box = end_box) {
    // Find the textline of boxes starting at start and their bounding box.
    TBOX line_box = boxes[start_box];
    std::string line_str = texts[start_box];
    for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t"; ++end_box) {
      line_box += boxes[end_box];
      line_str += texts[end_box];
    }
    // Find the most overlapping block.
    BLOCK *best_block = nullptr;
    int best_overlap = 0;
    BLOCK_IT b_it(block_list);
    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
      BLOCK *block = b_it.data();
      if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
        continue; // Not a text block.
      }
      TBOX block_box = block->pdblk.bounding_box();
      block_box.rotate(block->re_rotation());
      if (block_box.major_overlap(line_box)) {
        TBOX overlap_box = line_box.intersection(block_box);
        if (overlap_box.area() > best_overlap) {
          best_overlap = overlap_box.area();
          best_block = block;
        }
      }
    }
    ImageData *imagedata = nullptr;
    if (best_block == nullptr) {
      tprintf("No block overlapping textline: %s\n", line_str.c_str());
    } else {
      imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, *best_block);
    }
    if (imagedata != nullptr) {
      training_data->AddPageToDocument(imagedata);
    }
    // Don't let \t, which marks newlines in the box file, get into the line
    // content, as that makes the line unusable in training.
    while (end_box < texts.size() && texts[end_box] == "\t") {
      ++end_box;
    }
  }
}

// Builds the ImageData for one text line.
// The image is obtained from GetRectImage for line_box (padded by
// kImagePadding) and is not normalized in any way. The boxes and texts with
// indices in [start_box, end_box) are attached as ground truth, with each box
// rotated by the conjugate of block.re_rotation() and shifted so that it is
// relative to the returned image.
// Returns nullptr if GetRectImage could not produce an image.
ImageData *Tesseract::GetLineData(const TBOX &line_box, const std::vector<TBOX> &boxes,
                                  const std::vector<std::string> &texts, int start_box, int end_box,
                                  const BLOCK &block) {
  TBOX image_rect;
  ImageData *line_data = GetRectImage(line_box, block, kImagePadding, &image_rect);
  if (line_data != nullptr) {
    line_data->set_page_number(applybox_page);
    // Transform applied to every box: rotate, then translate so that the
    // bottom-left corner of the image rectangle becomes the origin.
    const FCOORD box_rotation(block.re_rotation().x(), -block.re_rotation().y());
    const ICOORD offset = -image_rect.botleft();
    std::vector<TBOX> truth_boxes;
    std::vector<std::string> truth_texts;
    int index = start_box;
    while (index < end_box) {
      TBOX shifted = boxes[index];
      shifted.rotate(box_rotation);
      shifted.move(offset);
      truth_boxes.push_back(shifted);
      truth_texts.push_back(texts[index]);
      ++index;
    }
    // Every box of the line belongs to the page currently being processed.
    std::vector<int> page_numbers(truth_boxes.size(), applybox_page);
    line_data->AddBoxes(truth_boxes, truth_texts, page_numbers);
  }
  return line_data;
}

// Helper gets the image of a rectangle, using the block.re_rotation() if
// needed to get to the image, and rotating the result back to horizontal
// layout. (CJK characters will be on their left sides) The vertical text flag
// is set in the returned ImageData if the text was originally vertical, which
// can be used to invoke a different CJK recognition engine. The revised_box
// is also returned to enable calculation of output bounding boxes.
//
// box:         rectangle to extract; it is padded by padding on every side.
// block:       supplies the re_rotation and the bounding box used to decide
//              whether box has to be rotated to reach image coordinates.
// revised_box: output; may be the same object as box.
//
// Returns a newly allocated ImageData, or nullptr if the padded box does not
// intersect the image or if clipping, rotating or depth conversion of the
// image fails. When nullptr is returned the content of revised_box should not
// be relied upon.
ImageData *Tesseract::GetRectImage(const TBOX &box, const BLOCK &block, int padding,
                                   TBOX *revised_box) const {
  // Copy before writing the output, as box and revised_box may alias.
  TBOX wbox = box;
  wbox.pad(padding, padding);
  *revised_box = wbox;
  // Number of clockwise 90 degree rotations needed to get back to tesseract
  // coords from the clipped image.
  int num_rotations = 0;
  if (block.re_rotation().y() > 0.0f) {
    num_rotations = 1;
  } else if (block.re_rotation().x() < 0.0f) {
    num_rotations = 2;
  } else if (block.re_rotation().y() < 0.0f) {
    num_rotations = 3;
  }
  // Handle two cases automatically: 1 the box came from the block, 2 the box
  // came from a box file, and refers to the image, which the block may not.
  if (block.pdblk.bounding_box().major_overlap(*revised_box)) {
    revised_box->rotate(block.re_rotation());
  }
  // Now revised_box always refers to the image.
  // BestPix is never colormapped, but may be of any depth.
  Image pix = BestPix();
  int width = pixGetWidth(pix);
  int height = pixGetHeight(pix);
  TBOX image_box(0, 0, width, height);
  // Clip to image bounds;
  *revised_box &= image_box;
  if (revised_box->null_box()) {
    return nullptr;
  }
  // The clip rectangle is given with its y measured as height - top, i.e.
  // with the vertical axis flipped relative to the TBOX.
  Box *clip_box = boxCreate(revised_box->left(), height - revised_box->top(), revised_box->width(),
                            revised_box->height());
  Image box_pix = pixClipRectangle(pix, clip_box, nullptr);
  boxDestroy(&clip_box);
  if (box_pix == nullptr) {
    return nullptr;
  }
  if (num_rotations > 0) {
    Image rot_pix = pixRotateOrth(box_pix, num_rotations);
    box_pix.destroy();
    if (rot_pix == nullptr) {
      // Rotation failed: report it instead of wrapping a null image.
      return nullptr;
    }
    box_pix = rot_pix;
  }
  // Convert sub-8-bit images to 8 bit.
  int depth = pixGetDepth(box_pix);
  if (depth < 8) {
    Image grey;
    grey = pixConvertTo8(box_pix, false);
    box_pix.destroy();
    if (grey == nullptr) {
      // Conversion failed: report it instead of wrapping a null image.
      return nullptr;
    }
    box_pix = grey;
  }
  bool vertical_text = false;
  if (num_rotations > 0) {
    // Rotated the clipped revised box back to internal coordinates.
    FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
    revised_box->rotate(rotation);
    // One or three quarter turns mean the text was originally vertical.
    if (num_rotations != 2) {
      vertical_text = true;
    }
  }
  return new ImageData(vertical_text, box_pix);
}

// Runs the LSTM recognizer on a word or group of words and stores the
// resulting WERD_RES in *words, then post-processes them with SearchWords.
// Analogous to classify_word_pass1, but can handle a group of words as well.
// Does nothing if no image can be obtained for the word.
void Tesseract::LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word,
                                  PointerVector<WERD_RES> *words) {
  TBOX word_box = word->word->bounding_box();
  // Get the word image - no frills.
  const bool use_whole_image =
      tessedit_pageseg_mode == PSM_SINGLE_WORD || tessedit_pageseg_mode == PSM_RAW_LINE;
  if (use_whole_image) {
    // In these modes take the complete image and ignore any row/word
    // interpretation.
    word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
  } else {
    // Grow the box vertically, where needed, so that it spans the row from
    // its descender limit up to its ascender limit at the middle of the word.
    const float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
    const float row_bottom = baseline + row->descenders();
    const float row_top = baseline + row->x_height() + row->ascenders();
    if (row_bottom < word_box.bottom()) {
      word_box.set_bottom(row_bottom);
    }
    if (row_top > word_box.top()) {
      word_box.set_top(row_top);
    }
  }
  // word_box is both input and output here: it comes back as the revised box.
  ImageData *im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
  if (im_data == nullptr) {
    return;
  }

  // The inversion threshold is only passed on when inversion is enabled.
  const bool do_invert = tessedit_do_invert;
  float threshold = 0.0f;
  if (do_invert) {
    threshold = static_cast<float>(static_cast<double>(invert_threshold));
  }
  const bool debug = classify_debug_level > 0;
  lstm_recognizer_->RecognizeLine(*im_data, threshold, debug,
                                  kWorstDictCertainty / kCertaintyScale, word_box, words,
                                  lstm_choice_mode, lstm_choice_iterations);
  delete im_data;
  SearchWords(words);
}

// Finalizes the words produced by the LSTM recognizer.
// A word without a best_choice is turned into a fake word. For every other
// word the best_state and reject_map are filled in from the best_choice, the
// done/accepted etc flags are set, the certainty of the best_choice is
// replaced by the scaled minimum of the space and choice certainties, and
// tess_accepted is finally decided by the stopper dictionary.
void Tesseract::SearchWords(PointerVector<WERD_RES> *words) {
  // Use the dictionary of the LSTM recognizer if it has one, otherwise the
  // dictionary of this Tesseract instance.
  const Dict *stopper_dict = lstm_recognizer_->GetDict();
  if (stopper_dict == nullptr) {
    stopper_dict = &getDict();
  }
  for (unsigned index = 0; index < words->size(); ++index) {
    WERD_RES *result = (*words)[index];
    auto *choice = result->best_choice;
    if (choice == nullptr) {
      // It is a dud.
      result->SetupFake(lstm_recognizer_->GetUnicharset());
      continue;
    }
    // Set the best state: one entry per position of the best choice.
    for (unsigned pos = 0; pos < choice->length(); ++pos) {
      const int state = choice->state(pos);
      result->best_state.push_back(state);
    }
    result->reject_map.initialise(choice->length());
    result->tess_failed = false;
    result->tess_accepted = true;
    result->tess_would_adapt = false;
    result->done = true;
    result->tesseract = this;
    // The word is only as certain as the weaker of its spacing and its text.
    const float raw_certainty = std::min(result->space_certainty, choice->certainty());
    const float word_certainty = raw_certainty * kCertaintyScale;
    if (getDict().stopper_debug_level >= 1) {
      tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
              choice->certainty(), result->space_certainty,
              raw_certainty * kCertaintyScale,
              word_certainty);
      choice->print();
    }
    choice->set_certainty(word_certainty);

    // The dictionary has the last word on acceptance.
    result->tess_accepted = stopper_dict->AcceptableResult(result);
  }
}

} // namespace tesseract.

Coverage Report

Created: 2024-02-28 06:46

Line	Count	Source (jump to first uncovered line)
1		///////////////////////////////////////////////////////////////////////
2		// File: linerec.cpp
3		// Description: Top-level line-based recognition module for Tesseract.
4		// Author: Ray Smith
5		//
6		// (C) Copyright 2013, Google Inc.
7		// Licensed under the Apache License, Version 2.0 (the "License");
8		// you may not use this file except in compliance with the License.
9		// You may obtain a copy of the License at
10		// http://www.apache.org/licenses/LICENSE-2.0
11		// Unless required by applicable law or agreed to in writing, software
12		// distributed under the License is distributed on an "AS IS" BASIS,
13		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		// See the License for the specific language governing permissions and
15		// limitations under the License.
16		///////////////////////////////////////////////////////////////////////
17
18		#include "tesseractclass.h"
19
20		#include <allheaders.h>
21		#include "boxread.h"
22		#include "imagedata.h" // for ImageData
23		#include "lstmrecognizer.h"
24		#include "pageres.h"
25		#include "recodebeam.h"
26		#include "tprintf.h"
27
28		#include <algorithm>
29
30		namespace tesseract {
31
32		// Scale factor to make certainty more comparable to Tesseract.
33		const float kCertaintyScale = 7.0f;
34		// Worst acceptable certainty for a dictionary word.
35		const float kWorstDictCertainty = -25.0f;
36
37		// Generates training data for training a line recognizer, eg LSTM.
38		// Breaks the page into lines, according to the boxes, and writes them to a
39		// serialized DocumentData based on output_basename.
40		// Return true if successful, false if an error occurred.
41		bool Tesseract::TrainLineRecognizer(const char *input_imagename, const std::string &output_basename,
42	0	BLOCK_LIST *block_list) {
43	0	std::string lstmf_name = output_basename + ".lstmf";
44	0	DocumentData images(lstmf_name);
45	0	if (applybox_page > 0) {
46		// Load existing document for the previous pages.
47	0	if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
48	0	tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
49	0	return false;
50	0	}
51	0	}
52	0	std::vector<TBOX> boxes;
53	0	std::vector<std::string> texts;
54		// Get the boxes for this page, if there are any.
55	0	if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr, nullptr) ||
56	0	boxes.empty()) {
57	0	tprintf("Failed to read boxes from %s\n", input_imagename);
58	0	return false;
59	0	}
60	0	TrainFromBoxes(boxes, texts, block_list, &images);
61	0	if (images.PagesSize() == 0) {
62	0	tprintf("Failed to read pages from %s\n", input_imagename);
63	0	return false;
64	0	}
65	0	images.Shuffle();
66	0	if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
67	0	tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
68	0	return false;
69	0	}
70	0	return true;
71	0	}
72
73		// Generates training data for training a line recognizer, eg LSTM.
74		// Breaks the boxes into lines, normalizes them, converts to ImageData and
75		// appends them to the given training_data.
76		void Tesseract::TrainFromBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
77	0	BLOCK_LIST *block_list, DocumentData *training_data) {
78	0	auto box_count = boxes.size();
79		// Process all the text lines in this page, as defined by the boxes.
80	0	unsigned end_box = 0;
81		// Don't let \t, which marks newlines in the box file, get into the line
82		// content, as that makes the line unusable in training.
83	0	while (end_box < texts.size() && texts[end_box] == "\t") {
84	0	++end_box;
85	0	}
86	0	for (auto start_box = end_box; start_box < box_count; start_box = end_box) {
87		// Find the textline of boxes starting at start and their bounding box.
88	0	TBOX line_box = boxes[start_box];
89	0	std::string line_str = texts[start_box];
90	0	for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t"; ++end_box) {
91	0	line_box += boxes[end_box];
92	0	line_str += texts[end_box];
93	0	}
94		// Find the most overlapping block.
95	0	BLOCK *best_block = nullptr;
96	0	int best_overlap = 0;
97	0	BLOCK_IT b_it(block_list);
98	0	for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
99	0	BLOCK *block = b_it.data();
100	0	if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
101	0	continue; // Not a text block.
102	0	}
103	0	TBOX block_box = block->pdblk.bounding_box();
104	0	block_box.rotate(block->re_rotation());
105	0	if (block_box.major_overlap(line_box)) {
106	0	TBOX overlap_box = line_box.intersection(block_box);
107	0	if (overlap_box.area() > best_overlap) {
108	0	best_overlap = overlap_box.area();
109	0	best_block = block;
110	0	}
111	0	}
112	0	}
113	0	ImageData *imagedata = nullptr;
114	0	if (best_block == nullptr) {
115	0	tprintf("No block overlapping textline: %s\n", line_str.c_str());
116	0	} else {
117	0	imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, *best_block);
118	0	}
119	0	if (imagedata != nullptr) {
120	0	training_data->AddPageToDocument(imagedata);
121	0	}
122		// Don't let \t, which marks newlines in the box file, get into the line
123		// content, as that makes the line unusable in training.
124	0	while (end_box < texts.size() && texts[end_box] == "\t") {
125	0	++end_box;
126	0	}
127	0	}
128	0	}
129
130		// Returns an Imagedata containing the image of the given box,
131		// and ground truth boxes/truth text if available in the input.
132		// The image is not normalized in any way.
133		ImageData *Tesseract::GetLineData(const TBOX &line_box, const std::vector<TBOX> &boxes,
134		const std::vector<std::string> &texts, int start_box, int end_box,
135	0	const BLOCK &block) {
136	0	TBOX revised_box;
137	0	ImageData *image_data = GetRectImage(line_box, block, kImagePadding, &revised_box);
138	0	if (image_data == nullptr) {
139	0	return nullptr;
140	0	}
141	0	image_data->set_page_number(applybox_page);
142		// Copy the boxes and shift them so they are relative to the image.
143	0	FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
144	0	ICOORD shift = -revised_box.botleft();
145	0	std::vector<TBOX> line_boxes;
146	0	std::vector<std::string> line_texts;
147	0	for (int b = start_box; b < end_box; ++b) {
148	0	TBOX box = boxes[b];
149	0	box.rotate(block_rotation);
150	0	box.move(shift);
151	0	line_boxes.push_back(box);
152	0	line_texts.push_back(texts[b]);
153	0	}
154	0	std::vector<int> page_numbers(line_boxes.size(), applybox_page);
155	0	image_data->AddBoxes(line_boxes, line_texts, page_numbers);
156	0	return image_data;
157	0	}
158
159		// Helper gets the image of a rectangle, using the block.re_rotation() if
160		// needed to get to the image, and rotating the result back to horizontal
161		// layout. (CJK characters will be on their left sides) The vertical text flag
162		// is set in the returned ImageData if the text was originally vertical, which
163		// can be used to invoke a different CJK recognition engine. The revised_box
164		// is also returned to enable calculation of output bounding boxes.
165		ImageData *Tesseract::GetRectImage(const TBOX &box, const BLOCK &block, int padding,
166	180k	TBOX *revised_box) const {
167	180k	TBOX wbox = box;
168	180k	wbox.pad(padding, padding);
169	180k	*revised_box = wbox;
170		// Number of clockwise 90 degree rotations needed to get back to tesseract
171		// coords from the clipped image.
172	180k	int num_rotations = 0;
173	180k	if (block.re_rotation().y() > 0.0f) {
174	0	num_rotations = 1;
175	180k	} else if (block.re_rotation().x() < 0.0f) {
176	0	num_rotations = 2;
177	180k	} else if (block.re_rotation().y() < 0.0f) {
178	0	num_rotations = 3;
179	0	}
180		// Handle two cases automatically: 1 the box came from the block, 2 the box
181		// came from a box file, and refers to the image, which the block may not.
182	180k	if (block.pdblk.bounding_box().major_overlap(*revised_box)) {
183	177k	revised_box->rotate(block.re_rotation());
184	177k	}
185		// Now revised_box always refers to the image.
186		// BestPix is never colormapped, but may be of any depth.
187	180k	Image pix = BestPix();
188	180k	int width = pixGetWidth(pix);
189	180k	int height = pixGetHeight(pix);
190	180k	TBOX image_box(0, 0, width, height);
191		// Clip to image bounds;
192	180k	*revised_box &= image_box;
193	180k	if (revised_box->null_box()) {
194	0	return nullptr;
195	0	}
196	180k	Box *clip_box = boxCreate(revised_box->left(), height - revised_box->top(), revised_box->width(),
197	180k	revised_box->height());
198	180k	Image box_pix = pixClipRectangle(pix, clip_box, nullptr);
199	180k	boxDestroy(&clip_box);
200	180k	if (box_pix == nullptr) {
201	0	return nullptr;
202	0	}
203	180k	if (num_rotations > 0) {
204	0	Image rot_pix = pixRotateOrth(box_pix, num_rotations);
205	0	box_pix.destroy();
206	0	box_pix = rot_pix;
207	0	}
208		// Convert sub-8-bit images to 8 bit.
209	180k	int depth = pixGetDepth(box_pix);
210	180k	if (depth < 8) {
211	180k	Image grey;
212	180k	grey = pixConvertTo8(box_pix, false);
213	180k	box_pix.destroy();
214	180k	box_pix = grey;
215	180k	}
216	180k	bool vertical_text = false;
217	180k	if (num_rotations > 0) {
218		// Rotated the clipped revised box back to internal coordinates.
219	0	FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
220	0	revised_box->rotate(rotation);
221	0	if (num_rotations != 2) {
222	0	vertical_text = true;
223	0	}
224	0	}
225	180k	return new ImageData(vertical_text, box_pix);
226	180k	}
227
228		// Recognizes a word or group of words, converting to WERD_RES in *words.
229		// Analogous to classify_word_pass1, but can handle a group of words as well.
230		void Tesseract::LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word,
231	180k	PointerVector<WERD_RES> *words) {
232	180k	TBOX word_box = word->word->bounding_box();
233		// Get the word image - no frills.
234	180k	if (tessedit_pageseg_mode == PSM_SINGLE_WORD || tessedit_pageseg_mode == PSM_RAW_LINE) {
235		// In single word mode, use the whole image without any other row/word
236		// interpretation.
237	0	word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
238	180k	} else {
239	180k	float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
240	180k	if (baseline + row->descenders() < word_box.bottom()) {
241	149k	word_box.set_bottom(baseline + row->descenders());
242	149k	}
243	180k	if (baseline + row->x_height() + row->ascenders() > word_box.top()) {
244	143k	word_box.set_top(baseline + row->x_height() + row->ascenders());
245	143k	}
246	180k	}
247	180k	ImageData *im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
248	180k	if (im_data == nullptr) {
249	0	return;
250	0	}
251
252	180k	bool do_invert = tessedit_do_invert;
253	180k	float threshold = do_invert ? double(invert_threshold) : 0.0f;
254	180k	lstm_recognizer_->RecognizeLine(*im_data, threshold, classify_debug_level > 0,
255	180k	kWorstDictCertainty / kCertaintyScale, word_box, words,
256	180k	lstm_choice_mode, lstm_choice_iterations);
257	180k	delete im_data;
258	180k	SearchWords(words);
259	180k	}
260
261		// Apply segmentation search to the given set of words, within the constraints
262		// of the existing ratings matrix. If there is already a best_choice on a word
263		// leaves it untouched and just sets the done/accepted etc flags.
264	180k	void Tesseract::SearchWords(PointerVector<WERD_RES> *words) {
265		// Run the segmentation search on the network outputs and make a BoxWord
266		// for each of the output words.
267		// If we drop a word as junk, then there is always a space in front of the
268		// next.
269	180k	const Dict *stopper_dict = lstm_recognizer_->GetDict();
270	180k	if (stopper_dict == nullptr) {
271	0	stopper_dict = &getDict();
272	0	}
273	343k	for (unsigned w = 0; w < words->size(); ++w) {
274	163k	WERD_RES *word = (*words)[w];
275	163k	if (word->best_choice == nullptr) {
276		// It is a dud.
277	0	word->SetupFake(lstm_recognizer_->GetUnicharset());
278	163k	} else {
279		// Set the best state.
280	384k	for (unsigned i = 0; i < word->best_choice->length(); ++i) {
281	221k	int length = word->best_choice->state(i);
282	221k	word->best_state.push_back(length);
283	221k	}
284	163k	word->reject_map.initialise(word->best_choice->length());
285	163k	word->tess_failed = false;
286	163k	word->tess_accepted = true;
287	163k	word->tess_would_adapt = false;
288	163k	word->done = true;
289	163k	word->tesseract = this;
290	163k	float word_certainty = std::min(word->space_certainty, word->best_choice->certainty());
291	163k	word_certainty *= kCertaintyScale;
292	163k	if (getDict().stopper_debug_level >= 1) {
293	0	tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
294	0	word->best_choice->certainty(), word->space_certainty,
295	0	std::min(word->space_certainty, word->best_choice->certainty()) * kCertaintyScale,
296	0	word_certainty);
297	0	word->best_choice->print();
298	0	}
299	163k	word->best_choice->set_certainty(word_certainty);
300
301	163k	word->tess_accepted = stopper_dict->AcceptableResult(word);
302	163k	}
303	163k	}
304	180k	}
305
306		} // namespace tesseract.