/src/tesseract/src/ccstruct/boxread.cpp
Line | Count | Source |
1 | | /********************************************************************** |
2 | | * File: boxread.cpp |
3 | | * Description: Read data from a box file. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | * (C) Copyright 2007, Google Inc. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | #include "boxread.h" |
20 | | |
21 | | #include "errcode.h" // for ERRCODE, TESSEXIT |
22 | | #include "fileerr.h" // for CANTOPENFILE |
23 | | #include "rect.h" // for TBOX |
24 | | #include "tprintf.h" // for tprintf |
25 | | |
26 | | #include <tesseract/unichar.h> // for UNICHAR |
27 | | #include "helpers.h" // for chomp_string |
28 | | |
29 | | #include <climits> // for INT_MAX |
30 | | #include <cstring> // for strchr, strcmp |
31 | | #include <fstream> // for std::ifstream |
32 | | #include <locale> // for std::locale::classic |
33 | | #include <sstream> // for std::stringstream |
34 | | #include <string> // for std::string |
35 | | |
36 | | namespace tesseract { |
37 | | |
// Special char code used to identify multi-blob labels.
// The pointer itself is const as well as the characters, so the marker
// cannot be re-pointed by accident from elsewhere in this file.
static const char *const kMultiBlobLabelCode = "WordStr";
40 | | |
// Returns the box file name corresponding to the given image_filename.
// A trailing ".bin.png", ".nrm.png" or ".raw.png" is removed as a whole
// (only when at least one character precedes it); otherwise the last
// extension of the final path component is removed. ".box" is then appended.
// A '.' inside a directory component is never taken for the start of an
// extension, so "dir.v1/image" maps to "dir.v1/image.box", not "dir.box".
static std::string BoxFileName(const char *image_filename) {
  // Length of the recognised double extensions, e.g. ".bin.png".
  const size_t kDoubleExtLength = 8;
  std::string box_filename = image_filename;
  const size_t length = box_filename.length();
  const std::string last =
      (length > kDoubleExtLength) ? box_filename.substr(length - kDoubleExtLength) : "";
  if (last == ".bin.png" || last == ".nrm.png" || last == ".raw.png") {
    box_filename.resize(length - kDoubleExtLength);
  } else {
    const size_t lastdot = box_filename.find_last_of('.');
    // Both separator styles are accepted so that Windows paths work too.
    const size_t lastsep = box_filename.find_last_of("/\\");
    const bool dot_in_last_component =
        lastdot != std::string::npos && (lastsep == std::string::npos || lastdot > lastsep);
    if (dot_in_last_component) {
      box_filename.resize(lastdot);
    }
  }
  box_filename += ".box";
  return box_filename;
}
57 | | |
58 | | // Open the boxfile based on the given image filename. |
59 | 0 | FILE *OpenBoxFile(const char *fname) { |
60 | 0 | std::string filename = BoxFileName(fname); |
61 | 0 | FILE *box_file = nullptr; |
62 | 0 | if (!(box_file = fopen(filename.c_str(), "rb"))) { |
63 | 0 | CANTOPENFILE.error("read_next_box", TESSEXIT, "Can't open box file %s", filename.c_str()); |
64 | 0 | tprintf("Can't open box file %s", filename.c_str()); |
65 | 0 | } |
66 | 0 | return box_file; |
67 | 0 | } |
68 | | |
69 | | // Reads all boxes from the given filename. |
70 | | // Reads a specific target_page number if >= 0, or all pages otherwise. |
71 | | // Skips blanks if skip_blanks is true. |
72 | | // The UTF-8 label of the box is put in texts, and the full box definition as |
73 | | // a string is put in box_texts, with the corresponding page number in pages. |
74 | | // Each of the output vectors is optional (may be nullptr). |
75 | | // Returns false if no boxes are found. |
76 | | bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector<TBOX> *boxes, |
77 | | std::vector<std::string> *texts, std::vector<std::string> *box_texts, |
78 | 0 | std::vector<int> *pages) { |
79 | 0 | std::ifstream input(BoxFileName(filename), std::ios::in | std::ios::binary); |
80 | 0 | if (input.fail()) { |
81 | 0 | tprintf("Cannot read box data from '%s'.\n", BoxFileName(filename).c_str()); |
82 | 0 | tprintf("Does it exists?\n"); |
83 | 0 | return false; |
84 | 0 | } |
85 | 0 | std::vector<char> box_data(std::istreambuf_iterator<char>(input), {}); |
86 | 0 | if (box_data.empty()) { |
87 | 0 | tprintf("No box data found in '%s'.\n", BoxFileName(filename).c_str()); |
88 | 0 | return false; |
89 | 0 | } |
90 | | // Convert the array of bytes to a string, so it can be used by the parser. |
91 | 0 | box_data.push_back('\0'); |
92 | 0 | return ReadMemBoxes(target_page, skip_blanks, &box_data[0], |
93 | 0 | /*continue_on_failure*/ true, boxes, texts, box_texts, pages); |
94 | 0 | } |
95 | | |
96 | | // Reads all boxes from the string. Otherwise, as ReadAllBoxes. |
97 | | bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, |
98 | | std::vector<TBOX> *boxes, std::vector<std::string> *texts, |
99 | 0 | std::vector<std::string> *box_texts, std::vector<int> *pages) { |
100 | 0 | std::string box_str(box_data); |
101 | 0 | std::vector<std::string> lines = split(box_str, '\n'); |
102 | 0 | if (lines.empty()) { |
103 | 0 | return false; |
104 | 0 | } |
105 | 0 | int num_boxes = 0; |
106 | 0 | for (auto &line : lines) { |
107 | 0 | int page = 0; |
108 | 0 | std::string utf8_str; |
109 | 0 | TBOX box; |
110 | 0 | if (!ParseBoxFileStr(line.c_str(), &page, utf8_str, &box)) { |
111 | 0 | if (continue_on_failure) { |
112 | 0 | continue; |
113 | 0 | } else { |
114 | 0 | return false; |
115 | 0 | } |
116 | 0 | } |
117 | 0 | if (skip_blanks && (utf8_str == " " || utf8_str == "\t")) { |
118 | 0 | continue; |
119 | 0 | } |
120 | 0 | if (target_page >= 0 && page != target_page) { |
121 | 0 | continue; |
122 | 0 | } |
123 | 0 | if (boxes != nullptr) { |
124 | 0 | boxes->push_back(box); |
125 | 0 | } |
126 | 0 | if (texts != nullptr) { |
127 | 0 | texts->push_back(utf8_str); |
128 | 0 | } |
129 | 0 | if (box_texts != nullptr) { |
130 | 0 | std::string full_text; |
131 | 0 | MakeBoxFileStr(utf8_str.c_str(), box, target_page, full_text); |
132 | 0 | box_texts->push_back(full_text); |
133 | 0 | } |
134 | 0 | if (pages != nullptr) { |
135 | 0 | pages->push_back(page); |
136 | 0 | } |
137 | 0 | ++num_boxes; |
138 | 0 | } |
139 | 0 | return num_boxes > 0; |
140 | 0 | } |
141 | | |
142 | | // TODO(rays) convert all uses of ReadNextBox to use the new ReadAllBoxes. |
143 | | // Box files are used ONLY DURING TRAINING, but by both processes of |
144 | | // creating tr files with tesseract, and unicharset_extractor. |
145 | | // ReadNextBox factors out the code to interpret a line of a box |
146 | | // file so that applybox and unicharset_extractor interpret the same way. |
147 | | // This function returns the next valid box file utf8 string and coords |
148 | | // and returns true, or false on eof (and closes the file). |
149 | | // It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks |
150 | | // for valid utf-8 and allows space or tab between fields. |
151 | | // utf8_str is set with the unichar string, and bounding box with the box. |
152 | | // If there are page numbers in the file, it reads them all. |
153 | 0 | bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box) { |
154 | 0 | return ReadNextBox(-1, line_number, box_file, utf8_str, bounding_box); |
155 | 0 | } |
156 | | |
157 | | // As ReadNextBox above, but get a specific page number. (0-based) |
158 | | // Use -1 to read any page number. Files without page number all |
159 | | // read as if they are page 0. |
160 | | bool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str, |
161 | 0 | TBOX *bounding_box) { |
162 | 0 | int page = 0; |
163 | 0 | char buff[kBoxReadBufSize]; // boxfile read buffer |
164 | 0 | char *buffptr = buff; |
165 | |
|
166 | 0 | while (fgets(buff, sizeof(buff) - 1, box_file)) { |
167 | 0 | (*line_number)++; |
168 | |
|
169 | 0 | buffptr = buff; |
170 | 0 | const auto *ubuf = reinterpret_cast<const unsigned char *>(buffptr); |
171 | 0 | if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) { |
172 | 0 | buffptr += 3; // Skip unicode file designation. |
173 | 0 | } |
174 | | // Check for blank lines in box file |
175 | 0 | if (*buffptr == '\n' || *buffptr == '\0') { |
176 | 0 | continue; |
177 | 0 | } |
178 | | // Skip blank boxes. |
179 | 0 | if (*buffptr == ' ' || *buffptr == '\t') { |
180 | 0 | continue; |
181 | 0 | } |
182 | 0 | if (*buffptr != '\0') { |
183 | 0 | if (!ParseBoxFileStr(buffptr, &page, utf8_str, bounding_box)) { |
184 | 0 | tprintf("Box file format error on line %i; ignored\n", *line_number); |
185 | 0 | continue; |
186 | 0 | } |
187 | 0 | if (target_page >= 0 && target_page != page) { |
188 | 0 | continue; // Not on the appropriate page. |
189 | 0 | } |
190 | 0 | return true; // Successfully read a box. |
191 | 0 | } |
192 | 0 | } |
193 | 0 | fclose(box_file); |
194 | 0 | return false; // EOF |
195 | 0 | } |
196 | | |
197 | | // Parses the given box file string into a page_number, utf8_str, and |
198 | | // bounding_box. Returns true on a successful parse. |
199 | | // The box file is assumed to contain box definitions, one per line, of the |
200 | | // following format for blob-level boxes: |
201 | | // <UTF8 str> <left> <bottom> <right> <top> <page id> |
202 | | // and for word/line-level boxes: |
203 | | // WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str> |
204 | | // See applyybox.cpp for more information. |
205 | | bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str, |
206 | 0 | TBOX *bounding_box) { |
207 | 0 | *bounding_box = TBOX(); // Initialize it to empty. |
208 | 0 | utf8_str = ""; |
209 | 0 | char uch[kBoxReadBufSize]; |
210 | 0 | const char *buffptr = boxfile_str; |
211 | | // Read the unichar without messing up on Tibetan. |
212 | | // According to issue 253 the utf-8 surrogates 85 and A0 are treated |
213 | | // as whitespace by sscanf, so it is more reliable to just find |
214 | | // ascii space and tab. |
215 | 0 | int uch_len = 0; |
216 | | // Skip unicode file designation, if present. |
217 | 0 | const auto *ubuf = reinterpret_cast<const unsigned char *>(buffptr); |
218 | 0 | if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) { |
219 | 0 | buffptr += 3; |
220 | 0 | } |
221 | | // Allow a single blank as the UTF-8 string. Check for empty string and |
222 | | // then blindly eat the first character. |
223 | 0 | if (*buffptr == '\0') { |
224 | 0 | return false; |
225 | 0 | } |
226 | 0 | do { |
227 | 0 | uch[uch_len++] = *buffptr++; |
228 | 0 | } while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t' && |
229 | 0 | uch_len < kBoxReadBufSize - 1); |
230 | 0 | uch[uch_len] = '\0'; |
231 | 0 | if (*buffptr != '\0') { |
232 | 0 | ++buffptr; |
233 | 0 | } |
234 | 0 | int x_min = INT_MAX; |
235 | 0 | int y_min = INT_MAX; |
236 | 0 | int x_max = INT_MIN; |
237 | 0 | int y_max = INT_MIN; |
238 | 0 | *page_number = 0; |
239 | 0 | std::stringstream stream(buffptr); |
240 | 0 | stream.imbue(std::locale::classic()); |
241 | 0 | stream >> x_min; |
242 | 0 | stream >> y_min; |
243 | 0 | stream >> x_max; |
244 | 0 | stream >> y_max; |
245 | 0 | stream >> *page_number; |
246 | 0 | if (x_max < x_min || y_max < y_min) { |
247 | 0 | tprintf("Bad box coordinates in boxfile string! %s\n", ubuf); |
248 | 0 | return false; |
249 | 0 | } |
250 | | // Test for long space-delimited string label. |
251 | 0 | if (strcmp(uch, kMultiBlobLabelCode) == 0 && (buffptr = strchr(buffptr, '#')) != nullptr) { |
252 | 0 | strncpy(uch, buffptr + 1, kBoxReadBufSize - 1); |
253 | 0 | uch[kBoxReadBufSize - 1] = '\0'; // Prevent buffer overrun. |
254 | 0 | chomp_string(uch); |
255 | 0 | uch_len = strlen(uch); |
256 | 0 | } |
257 | | // Validate UTF8 by making unichars with it. |
258 | 0 | int used = 0; |
259 | 0 | while (used < uch_len) { |
260 | 0 | tesseract::UNICHAR ch(uch + used, uch_len - used); |
261 | 0 | int new_used = ch.utf8_len(); |
262 | 0 | if (new_used == 0) { |
263 | 0 | tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n", uch + used, uch[used], used + 1); |
264 | 0 | return false; |
265 | 0 | } |
266 | 0 | used += new_used; |
267 | 0 | } |
268 | 0 | utf8_str = uch; |
269 | 0 | if (x_min > x_max) { |
270 | 0 | std::swap(x_min, x_max); |
271 | 0 | } |
272 | 0 | if (y_min > y_max) { |
273 | 0 | std::swap(y_min, y_max); |
274 | 0 | } |
275 | 0 | bounding_box->set_to_given_coords(x_min, y_min, x_max, y_max); |
276 | 0 | return true; // Successfully read a box. |
277 | 0 | } |
278 | | |
279 | | // Creates a box file string from a unichar string, TBOX and page number. |
280 | 0 | void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str) { |
281 | 0 | box_str = unichar_str; |
282 | 0 | box_str += " " + std::to_string(box.left()); |
283 | 0 | box_str += " " + std::to_string(box.bottom()); |
284 | 0 | box_str += " " + std::to_string(box.right()); |
285 | 0 | box_str += " " + std::to_string(box.top()); |
286 | 0 | box_str += " " + std::to_string(page_num); |
287 | 0 | } |
288 | | |
289 | | } // namespace tesseract |