/src/tesseract/src/ccmain/recogtraining.cpp

Source (jump to first uncovered line)
///////////////////////////////////////////////////////////////////////
// File:        recogtraining.cpp
// Description: Functions for ambiguity and parameter training.
// Author:      Daria Antonova
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include "tesseractclass.h"

#include "boxread.h"
#include "control.h"
#include "host.h" // for NearlyEqual
#include "ratngs.h"
#ifndef DISABLED_LEGACY_ENGINE
#  include "reject.h"
#endif
#include "stopper.h"

namespace tesseract {

// Tolerance handed to NearlyEqual<int>() when an edge of a Tesseract word box
// is compared with the corresponding edge of a box read from the box file.
const int16_t kMaxBoxEdgeDiff = 2;

// Sets flags necessary for recognition in the training mode.
// Opens and returns the pointer to the output file.
//
// When tessedit_ambigs_training is set, adaption and the document dictionary
// are turned off and stopper_no_acceptable_choices is enabled.
//
// The output file name is built from filename by dropping everything from the
// last '.' onwards (if there is one) and appending ".txt". The file is opened
// in "a+" mode. If it cannot be opened, an error is printed and ASSERT_HOST
// is tripped. This function does not close the returned file.
FILE *Tesseract::init_recog_training(const char *filename) {
  if (tessedit_ambigs_training) {
    tessedit_tess_adaption_mode.set_value(0); // turn off adaption
    tessedit_enable_doc_dict.set_value(false); // turn off document dictionary
    // Explore all segmentations.
    getDict().stopper_no_acceptable_choices.set_value(true);
  }

  std::string output_fname = filename;
  // Really truncate at the last dot. Overwriting the dot with '\0' would keep
  // the old extension inside the std::string, and c_str() would then stop at
  // the embedded NUL, hiding the ".txt" suffix appended below from fopen().
  // NOTE(review): the last '.' may belong to a directory component when the
  // file name itself has no extension - same as before, confirm if relevant.
  const std::string::size_type lastdot = output_fname.rfind('.');
  if (lastdot != std::string::npos) {
    output_fname.resize(lastdot);
  }
  output_fname += ".txt";
  FILE *output_file = fopen(output_fname.c_str(), "a+");
  if (output_file == nullptr) {
    tprintf("Error: Could not open file %s\n", output_fname.c_str());
    ASSERT_HOST(output_file);
  }
  return output_file;
}

// Advances page_res_it past positions that have a block but no word, then
// stores the bounding box of the word it lands on in *tbox.
// Returns true if a word was found (and *tbox written), false otherwise.
static bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
  // Skip ahead until the iterator either runs out of blocks or has a word.
  for (;;) {
    if (page_res_it->block() == nullptr || page_res_it->word() != nullptr) {
      break;
    }
    page_res_it->forward();
  }

  // Nothing left to read.
  if (page_res_it->word() == nullptr) {
    return false;
  }

  *tbox = page_res_it->word()->word->bounding_box();

  // A negative left edge means the training image has vertical text, with
  // all page_res bounding boxes rotated by 90 degrees counterclockwise.
  // Rotate the box back so that it is comparable with the box-file TBOXes.
  const bool is_rotated = tbox->left() < 0;
  if (is_rotated) {
    tbox->rotate(FCOORD(0.0, -1.0));
  }
  return true;
}

// This function takes tif/box pair of files and runs recognition on the image,
// while making sure that the word bounds that tesseract identified roughly
// match to those specified by the input box file. For each word (ngram in a
// single bounding box from the input box file) it outputs the ocred result,
// the correct label, rating and certainty.
//
// filename:    name of the input; the box file name is derived from it by
//              dropping everything from the last '.' onwards (if any) and
//              appending ".box".
// page_res:    page results whose words are walked and classified.
// monitor:     not used by this function.
// output_file: destination passed on to ambigs_classify_and_output().
//
// If the box file cannot be opened, an error is printed and ASSERT_HOST is
// tripped. At the end a summary of examined vs. total words is printed.
void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_res,
                                         volatile ETEXT_DESC *monitor, FILE *output_file) {
  std::string box_fname = filename;
  // Really truncate at the last dot. Overwriting the dot with '\0' would keep
  // the old extension inside the std::string, and c_str() would then stop at
  // the embedded NUL, hiding the ".box" suffix appended below from fopen().
  const std::string::size_type lastdot = box_fname.rfind('.');
  if (lastdot != std::string::npos) {
    box_fname.resize(lastdot);
  }
  box_fname += ".box";
  // ReadNextBox() will close box_file
  // NOTE(review): if the page runs out of words before ReadNextBox() reports
  // the end of the box file, box_file looks like it stays open - confirm
  // against ReadNextBox().
  FILE *box_file = fopen(box_fname.c_str(), "r");
  if (box_file == nullptr) {
    tprintf("Error: Could not open file %s\n", box_fname.c_str());
    ASSERT_HOST(box_file);
  }

  PAGE_RES_IT page_res_it;
  page_res_it.page_res = page_res;
  page_res_it.restart_page();
  std::string label;

  // Process all the words on this page.
  TBOX tbox; // tesseract-identified box
  TBOX bbox; // box from the box file
  bool keep_going;
  int line_number = 0;
  int examined_words = 0;
  do {
    keep_going = read_t(&page_res_it, &tbox);
    // '&=' does not short-circuit: the next box is read even if no word was
    // found above.
    keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
    // Align bottom left points of the TBOXes.
    // Bottoms first: advance whichever side is behind until they nearly match
    // or one of the two sources is exhausted.
    while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
      if (bbox.bottom() < tbox.bottom()) {
        page_res_it.forward();
        keep_going = read_t(&page_res_it, &tbox);
      } else {
        keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
      }
    }
    // Then the left edges, with the same advance-one-side strategy.
    while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
      if (bbox.left() > tbox.left()) {
        page_res_it.forward();
        keep_going = read_t(&page_res_it, &tbox);
      } else {
        keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
      }
    }
    // OCR the word if top right points of the TBOXes are similar.
    if (keep_going && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
        NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
      ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
      examined_words++;
    }
    page_res_it.forward();
  } while (keep_going);

  // Set up scripts on all of the words that did not get sent to
  // ambigs_classify_and_output.  They all should have, but if all the
  // werd_res's don't get uch_sets, tesseract will crash when you try
  // to iterate over them. :-(
  int total_words = 0;
  for (page_res_it.restart_page(); page_res_it.block() != nullptr; page_res_it.forward()) {
    if (page_res_it.word()) {
      if (page_res_it.word()->uch_set == nullptr) {
        page_res_it.word()->SetupFake(unicharset);
      }
      total_words++;
    }
  }
  // Warn when fewer than 85% of the words on the page were examined.
  if (examined_words < 0.85 * total_words) {
    tprintf(
        "TODO(antonova): clean up recog_training_segmented; "
        " It examined only a small fraction of the ambigs image.\n");
  }
  tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words, total_words);
}

// Helper prints the given set of blob choices.
static void PrintPath(int length, const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
                      const char *label, FILE *output_file) {
  float rating = 0.0f;
  float certainty = 0.0f;
  for (int i = 0; i < length; ++i) {
    const BLOB_CHOICE *blob_choice = blob_choices[i];
    fprintf(output_file, "%s", unicharset.id_to_unichar(blob_choice->unichar_id()));
    rating += blob_choice->rating();
    if (certainty > blob_choice->certainty()) {
      certainty = blob_choice->certainty();
    }
  }
  fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, rating, certainty);
}

// Helper recursively prints all paths through the ratings matrix, starting
// at column col.
static void PrintMatrixPaths(int col, int dim, const MATRIX &ratings, int length,
                             const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
                             const char *label, FILE *output_file) {
  for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
    if (ratings.get(col, row) != NOT_CLASSIFIED) {
      BLOB_CHOICE_IT bc_it(ratings.get(col, row));
      for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
        blob_choices[length] = bc_it.data();
        if (row + 1 < dim) {
          PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices, unicharset, label,
                           output_file);
        } else {
          PrintPath(length + 1, blob_choices, unicharset, label, output_file);
        }
      }
    }
  }
}

// Runs classify_word_pass1() on the current word. Outputs Tesseract's
// raw choice as a result of the classification. For words labeled with a
// single unichar also outputs all alternatives from blob_choices of the
// best choice.
void Tesseract::ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it,
                                           FILE *output_file) {
  // Classify word.
  fflush(stdout);
  WordData word_data(*pr_it);
  SetupWordPassN(1, &word_data);
  classify_word_and_language(1, pr_it, &word_data);
  WERD_RES *werd_res = word_data.word;
  WERD_CHOICE *best_choice = werd_res->best_choice;
  ASSERT_HOST(best_choice != nullptr);

  // Compute the number of unichars in the label.
  std::vector<UNICHAR_ID> encoding;
  if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
    tprintf("Not outputting illegal unichar %s\n", label);
    return;
  }

  // Dump all paths through the ratings matrix (which is normally small).
  int dim = werd_res->ratings->dimension();
  const auto **blob_choices = new const BLOB_CHOICE *[dim];
  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset, label, output_file);
  delete[] blob_choices;
}

} // namespace tesseract

Coverage Report

Created: 2024-02-28 06:46

Line	Count	Source (jump to first uncovered line)
1		///////////////////////////////////////////////////////////////////////
2		// File: recogtraining.cpp
3		// Description: Functions for ambiguity and parameter training.
4		// Author: Daria Antonova
5		//
6		// (C) Copyright 2009, Google Inc.
7		// Licensed under the Apache License, Version 2.0 (the "License");
8		// you may not use this file except in compliance with the License.
9		// You may obtain a copy of the License at
10		// http://www.apache.org/licenses/LICENSE-2.0
11		// Unless required by applicable law or agreed to in writing, software
12		// distributed under the License is distributed on an "AS IS" BASIS,
13		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		// See the License for the specific language governing permissions and
15		// limitations under the License.
16		//
17		///////////////////////////////////////////////////////////////////////
18
19		#include "tesseractclass.h"
20
21		#include "boxread.h"
22		#include "control.h"
23		#include "host.h" // for NearlyEqual
24		#include "ratngs.h"
25		#ifndef DISABLED_LEGACY_ENGINE
26		# include "reject.h"
27		#endif
28		#include "stopper.h"
29
30		namespace tesseract {
31
32		const int16_t kMaxBoxEdgeDiff = 2;
33
34		// Sets flags necessary for recognition in the training mode.
35		// Opens and returns the pointer to the output file.
36	0	FILE Tesseract::init_recog_training(const char filename) {
37	0	if (tessedit_ambigs_training) {
38	0	tessedit_tess_adaption_mode.set_value(0); // turn off adaption
39	0	tessedit_enable_doc_dict.set_value(false); // turn off document dictionary
40		// Explore all segmentations.
41	0	getDict().stopper_no_acceptable_choices.set_value(true);
42	0	}
43
44	0	std::string output_fname = filename;
45	0	const char *lastdot = strrchr(output_fname.c_str(), '.');
46	0	if (lastdot != nullptr) {
47	0	output_fname[lastdot - output_fname.c_str()] = '\0';
48	0	}
49	0	output_fname += ".txt";
50	0	FILE *output_file = fopen(output_fname.c_str(), "a+");
51	0	if (output_file == nullptr) {
52	0	tprintf("Error: Could not open file %s\n", output_fname.c_str());
53	0	ASSERT_HOST(output_file);
54	0	}
55	0	return output_file;
56	0	}
57
58		// Copies the bounding box from page_res_it->word() to the given TBOX.
59	0	static bool read_t(PAGE_RES_IT page_res_it, TBOX tbox) {
60	0	while (page_res_it->block() != nullptr && page_res_it->word() == nullptr) {
61	0	page_res_it->forward();
62	0	}
63
64	0	if (page_res_it->word() != nullptr) {
65	0	*tbox = page_res_it->word()->word->bounding_box();
66
67		// If tbox->left() is negative, the training image has vertical text and
68		// all the coordinates of bounding boxes of page_res are rotated by 90
69		// degrees in a counterclockwise direction. We need to rotate the TBOX back
70		// in order to compare with the TBOXes of box files.
71	0	if (tbox->left() < 0) {
72	0	tbox->rotate(FCOORD(0.0, -1.0));
73	0	}
74
75	0	return true;
76	0	} else {
77	0	return false;
78	0	}
79	0	}
80
81		// This function takes tif/box pair of files and runs recognition on the image,
82		// while making sure that the word bounds that tesseract identified roughly
83		// match to those specified by the input box file. For each word (ngram in a
84		// single bounding box from the input box file) it outputs the ocred result,
85		// the correct label, rating and certainty.
86		void Tesseract::recog_training_segmented(const char filename, PAGE_RES page_res,
87	0	volatile ETEXT_DESC monitor, FILE output_file) {
88	0	std::string box_fname = filename;
89	0	const char *lastdot = strrchr(box_fname.c_str(), '.');
90	0	if (lastdot != nullptr) {
91	0	box_fname[lastdot - box_fname.c_str()] = '\0';
92	0	}
93	0	box_fname += ".box";
94		// ReadNextBox() will close box_file
95	0	FILE *box_file = fopen(box_fname.c_str(), "r");
96	0	if (box_file == nullptr) {
97	0	tprintf("Error: Could not open file %s\n", box_fname.c_str());
98	0	ASSERT_HOST(box_file);
99	0	}
100
101	0	PAGE_RES_IT page_res_it;
102	0	page_res_it.page_res = page_res;
103	0	page_res_it.restart_page();
104	0	std::string label;
105
106		// Process all the words on this page.
107	0	TBOX tbox; // tesseract-identified box
108	0	TBOX bbox; // box from the box file
109	0	bool keep_going;
110	0	int line_number = 0;
111	0	int examined_words = 0;
112	0	do {
113	0	keep_going = read_t(&page_res_it, &tbox);
114	0	keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
115		// Align bottom left points of the TBOXes.
116	0	while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
117	0	if (bbox.bottom() < tbox.bottom()) {
118	0	page_res_it.forward();
119	0	keep_going = read_t(&page_res_it, &tbox);
120	0	} else {
121	0	keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
122	0	}
123	0	}
124	0	while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
125	0	if (bbox.left() > tbox.left()) {
126	0	page_res_it.forward();
127	0	keep_going = read_t(&page_res_it, &tbox);
128	0	} else {
129	0	keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
130	0	}
131	0	}
132		// OCR the word if top right points of the TBOXes are similar.
133	0	if (keep_going && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
134	0	NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
135	0	ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
136	0	examined_words++;
137	0	}
138	0	page_res_it.forward();
139	0	} while (keep_going);
140
141		// Set up scripts on all of the words that did not get sent to
142		// ambigs_classify_and_output. They all should have, but if all the
143		// werd_res's don't get uch_sets, tesseract will crash when you try
144		// to iterate over them. :-(
145	0	int total_words = 0;
146	0	for (page_res_it.restart_page(); page_res_it.block() != nullptr; page_res_it.forward()) {
147	0	if (page_res_it.word()) {
148	0	if (page_res_it.word()->uch_set == nullptr) {
149	0	page_res_it.word()->SetupFake(unicharset);
150	0	}
151	0	total_words++;
152	0	}
153	0	}
154	0	if (examined_words < 0.85 * total_words) {
155	0	tprintf(
156	0	"TODO(antonova): clean up recog_training_segmented; "
157	0	" It examined only a small fraction of the ambigs image.\n");
158	0	}
159	0	tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words, total_words);
160	0	}
161
162		// Helper prints the given set of blob choices.
163		static void PrintPath(int length, const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
164	0	const char label, FILE output_file) {
165	0	float rating = 0.0f;
166	0	float certainty = 0.0f;
167	0	for (int i = 0; i < length; ++i) {
168	0	const BLOB_CHOICE *blob_choice = blob_choices[i];
169	0	fprintf(output_file, "%s", unicharset.id_to_unichar(blob_choice->unichar_id()));
170	0	rating += blob_choice->rating();
171	0	if (certainty > blob_choice->certainty()) {
172	0	certainty = blob_choice->certainty();
173	0	}
174	0	}
175	0	fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, rating, certainty);
176	0	}
177
178		// Helper recursively prints all paths through the ratings matrix, starting
179		// at column col.
180		static void PrintMatrixPaths(int col, int dim, const MATRIX &ratings, int length,
181		const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
182	0	const char label, FILE output_file) {
183	0	for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
184	0	if (ratings.get(col, row) != NOT_CLASSIFIED) {
185	0	BLOB_CHOICE_IT bc_it(ratings.get(col, row));
186	0	for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
187	0	blob_choices[length] = bc_it.data();
188	0	if (row + 1 < dim) {
189	0	PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices, unicharset, label,
190	0	output_file);
191	0	} else {
192	0	PrintPath(length + 1, blob_choices, unicharset, label, output_file);
193	0	}
194	0	}
195	0	}
196	0	}
197	0	}
198
199		// Runs classify_word_pass1() on the current word. Outputs Tesseract's
200		// raw choice as a result of the classification. For words labeled with a
201		// single unichar also outputs all alternatives from blob_choices of the
202		// best choice.
203		void Tesseract::ambigs_classify_and_output(const char label, PAGE_RES_IT pr_it,
204	0	FILE *output_file) {
205		// Classify word.
206	0	fflush(stdout);
207	0	WordData word_data(*pr_it);
208	0	SetupWordPassN(1, &word_data);
209	0	classify_word_and_language(1, pr_it, &word_data);
210	0	WERD_RES *werd_res = word_data.word;
211	0	WERD_CHOICE *best_choice = werd_res->best_choice;
212	0	ASSERT_HOST(best_choice != nullptr);
213
214		// Compute the number of unichars in the label.
215	0	std::vector<UNICHAR_ID> encoding;
216	0	if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
217	0	tprintf("Not outputting illegal unichar %s\n", label);
218	0	return;
219	0	}
220
221		// Dump all paths through the ratings matrix (which is normally small).
222	0	int dim = werd_res->ratings->dimension();
223	0	const auto *blob_choices = new const BLOB_CHOICE [dim];
224	0	PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset, label, output_file);
225	0	delete[] blob_choices;
226	0	}
227
228		} // namespace tesseract