/src/tesseract/src/ccmain/recogtraining.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: recogtraining.cpp |
3 | | // Description: Functions for ambiguity and parameter training. |
4 | | // Author: Daria Antonova |
5 | | // |
6 | | // (C) Copyright 2009, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | // |
17 | | /////////////////////////////////////////////////////////////////////// |
18 | | |
19 | | #include "tesseractclass.h" |
20 | | |
21 | | #include "boxread.h" |
22 | | #include "control.h" |
23 | | #include "host.h" // for NearlyEqual |
24 | | #include "ratngs.h" |
25 | | #ifndef DISABLED_LEGACY_ENGINE |
26 | | # include "reject.h" |
27 | | #endif |
28 | | #include "stopper.h" |
29 | | |
30 | | namespace tesseract { |
31 | | |
// Tolerance handed to NearlyEqual() when an edge of a Tesseract-identified
// word box is compared against the matching edge of a box-file box.
constexpr int16_t kMaxBoxEdgeDiff = 2;
33 | | |
34 | | // Sets flags necessary for recognition in the training mode. |
35 | | // Opens and returns the pointer to the output file. |
36 | 0 | FILE *Tesseract::init_recog_training(const char *filename) { |
37 | 0 | if (tessedit_ambigs_training) { |
38 | 0 | tessedit_tess_adaption_mode.set_value(0); // turn off adaption |
39 | 0 | tessedit_enable_doc_dict.set_value(false); // turn off document dictionary |
40 | | // Explore all segmentations. |
41 | 0 | getDict().stopper_no_acceptable_choices.set_value(true); |
42 | 0 | } |
43 | |
|
44 | 0 | std::string output_fname = filename; |
45 | 0 | const char *lastdot = strrchr(output_fname.c_str(), '.'); |
46 | 0 | if (lastdot != nullptr) { |
47 | 0 | output_fname[lastdot - output_fname.c_str()] = '\0'; |
48 | 0 | } |
49 | 0 | output_fname += ".txt"; |
50 | 0 | FILE *output_file = fopen(output_fname.c_str(), "a+"); |
51 | 0 | if (output_file == nullptr) { |
52 | 0 | tprintf("Error: Could not open file %s\n", output_fname.c_str()); |
53 | 0 | ASSERT_HOST(output_file); |
54 | 0 | } |
55 | 0 | return output_file; |
56 | 0 | } |
57 | | |
58 | | // Copies the bounding box from page_res_it->word() to the given TBOX. |
59 | 0 | static bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) { |
60 | 0 | while (page_res_it->block() != nullptr && page_res_it->word() == nullptr) { |
61 | 0 | page_res_it->forward(); |
62 | 0 | } |
63 | |
|
64 | 0 | if (page_res_it->word() != nullptr) { |
65 | 0 | *tbox = page_res_it->word()->word->bounding_box(); |
66 | | |
67 | | // If tbox->left() is negative, the training image has vertical text and |
68 | | // all the coordinates of bounding boxes of page_res are rotated by 90 |
69 | | // degrees in a counterclockwise direction. We need to rotate the TBOX back |
70 | | // in order to compare with the TBOXes of box files. |
71 | 0 | if (tbox->left() < 0) { |
72 | 0 | tbox->rotate(FCOORD(0.0, -1.0)); |
73 | 0 | } |
74 | |
|
75 | 0 | return true; |
76 | 0 | } else { |
77 | 0 | return false; |
78 | 0 | } |
79 | 0 | } |
80 | | |
81 | | // This function takes tif/box pair of files and runs recognition on the image, |
82 | | // while making sure that the word bounds that tesseract identified roughly |
83 | | // match to those specified by the input box file. For each word (ngram in a |
84 | | // single bounding box from the input box file) it outputs the ocred result, |
85 | | // the correct label, rating and certainty. |
86 | | void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_res, |
87 | 0 | volatile ETEXT_DESC *monitor, FILE *output_file) { |
88 | 0 | std::string box_fname = filename; |
89 | 0 | const char *lastdot = strrchr(box_fname.c_str(), '.'); |
90 | 0 | if (lastdot != nullptr) { |
91 | 0 | box_fname[lastdot - box_fname.c_str()] = '\0'; |
92 | 0 | } |
93 | 0 | box_fname += ".box"; |
94 | | // ReadNextBox() will close box_file |
95 | 0 | FILE *box_file = fopen(box_fname.c_str(), "r"); |
96 | 0 | if (box_file == nullptr) { |
97 | 0 | tprintf("Error: Could not open file %s\n", box_fname.c_str()); |
98 | 0 | ASSERT_HOST(box_file); |
99 | 0 | } |
100 | |
|
101 | 0 | PAGE_RES_IT page_res_it; |
102 | 0 | page_res_it.page_res = page_res; |
103 | 0 | page_res_it.restart_page(); |
104 | 0 | std::string label; |
105 | | |
106 | | // Process all the words on this page. |
107 | 0 | TBOX tbox; // tesseract-identified box |
108 | 0 | TBOX bbox; // box from the box file |
109 | 0 | bool keep_going; |
110 | 0 | int line_number = 0; |
111 | 0 | int examined_words = 0; |
112 | 0 | do { |
113 | 0 | keep_going = read_t(&page_res_it, &tbox); |
114 | 0 | keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox); |
115 | | // Align bottom left points of the TBOXes. |
116 | 0 | while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) { |
117 | 0 | if (bbox.bottom() < tbox.bottom()) { |
118 | 0 | page_res_it.forward(); |
119 | 0 | keep_going = read_t(&page_res_it, &tbox); |
120 | 0 | } else { |
121 | 0 | keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox); |
122 | 0 | } |
123 | 0 | } |
124 | 0 | while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) { |
125 | 0 | if (bbox.left() > tbox.left()) { |
126 | 0 | page_res_it.forward(); |
127 | 0 | keep_going = read_t(&page_res_it, &tbox); |
128 | 0 | } else { |
129 | 0 | keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox); |
130 | 0 | } |
131 | 0 | } |
132 | | // OCR the word if top right points of the TBOXes are similar. |
133 | 0 | if (keep_going && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) && |
134 | 0 | NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) { |
135 | 0 | ambigs_classify_and_output(label.c_str(), &page_res_it, output_file); |
136 | 0 | examined_words++; |
137 | 0 | } |
138 | 0 | page_res_it.forward(); |
139 | 0 | } while (keep_going); |
140 | | |
141 | | // Set up scripts on all of the words that did not get sent to |
142 | | // ambigs_classify_and_output. They all should have, but if all the |
143 | | // werd_res's don't get uch_sets, tesseract will crash when you try |
144 | | // to iterate over them. :-( |
145 | 0 | int total_words = 0; |
146 | 0 | for (page_res_it.restart_page(); page_res_it.block() != nullptr; page_res_it.forward()) { |
147 | 0 | if (page_res_it.word()) { |
148 | 0 | if (page_res_it.word()->uch_set == nullptr) { |
149 | 0 | page_res_it.word()->SetupFake(unicharset); |
150 | 0 | } |
151 | 0 | total_words++; |
152 | 0 | } |
153 | 0 | } |
154 | 0 | if (examined_words < 0.85 * total_words) { |
155 | 0 | tprintf( |
156 | 0 | "TODO(antonova): clean up recog_training_segmented; " |
157 | 0 | " It examined only a small fraction of the ambigs image.\n"); |
158 | 0 | } |
159 | 0 | tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words, total_words); |
160 | 0 | } |
161 | | |
162 | | // Helper prints the given set of blob choices. |
163 | | static void PrintPath(int length, const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset, |
164 | 0 | const char *label, FILE *output_file) { |
165 | 0 | float rating = 0.0f; |
166 | 0 | float certainty = 0.0f; |
167 | 0 | for (int i = 0; i < length; ++i) { |
168 | 0 | const BLOB_CHOICE *blob_choice = blob_choices[i]; |
169 | 0 | fprintf(output_file, "%s", unicharset.id_to_unichar(blob_choice->unichar_id())); |
170 | 0 | rating += blob_choice->rating(); |
171 | 0 | if (certainty > blob_choice->certainty()) { |
172 | 0 | certainty = blob_choice->certainty(); |
173 | 0 | } |
174 | 0 | } |
175 | 0 | fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, rating, certainty); |
176 | 0 | } |
177 | | |
178 | | // Helper recursively prints all paths through the ratings matrix, starting |
179 | | // at column col. |
180 | | static void PrintMatrixPaths(int col, int dim, const MATRIX &ratings, int length, |
181 | | const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset, |
182 | 0 | const char *label, FILE *output_file) { |
183 | 0 | for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) { |
184 | 0 | if (ratings.get(col, row) != NOT_CLASSIFIED) { |
185 | 0 | BLOB_CHOICE_IT bc_it(ratings.get(col, row)); |
186 | 0 | for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) { |
187 | 0 | blob_choices[length] = bc_it.data(); |
188 | 0 | if (row + 1 < dim) { |
189 | 0 | PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices, unicharset, label, |
190 | 0 | output_file); |
191 | 0 | } else { |
192 | 0 | PrintPath(length + 1, blob_choices, unicharset, label, output_file); |
193 | 0 | } |
194 | 0 | } |
195 | 0 | } |
196 | 0 | } |
197 | 0 | } |
198 | | |
199 | | // Runs classify_word_pass1() on the current word. Outputs Tesseract's |
200 | | // raw choice as a result of the classification. For words labeled with a |
201 | | // single unichar also outputs all alternatives from blob_choices of the |
202 | | // best choice. |
203 | | void Tesseract::ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, |
204 | 0 | FILE *output_file) { |
205 | | // Classify word. |
206 | 0 | fflush(stdout); |
207 | 0 | WordData word_data(*pr_it); |
208 | 0 | SetupWordPassN(1, &word_data); |
209 | 0 | classify_word_and_language(1, pr_it, &word_data); |
210 | 0 | WERD_RES *werd_res = word_data.word; |
211 | 0 | WERD_CHOICE *best_choice = werd_res->best_choice; |
212 | 0 | ASSERT_HOST(best_choice != nullptr); |
213 | | |
214 | | // Compute the number of unichars in the label. |
215 | 0 | std::vector<UNICHAR_ID> encoding; |
216 | 0 | if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) { |
217 | 0 | tprintf("Not outputting illegal unichar %s\n", label); |
218 | 0 | return; |
219 | 0 | } |
220 | | |
221 | | // Dump all paths through the ratings matrix (which is normally small). |
222 | 0 | int dim = werd_res->ratings->dimension(); |
223 | 0 | const auto **blob_choices = new const BLOB_CHOICE *[dim]; |
224 | 0 | PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset, label, output_file); |
225 | 0 | delete[] blob_choices; |
226 | 0 | } |
227 | | |
228 | | } // namespace tesseract |