/src/tesseract/src/ccutil/ambigs.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: ambigs.cpp |
3 | | // Description: Functions for dealing with ambiguities |
4 | | // (training and recognition). |
5 | | // Author: Daria Antonova |
6 | | // |
7 | | // (C) Copyright 2008, Google Inc. |
8 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
9 | | // you may not use this file except in compliance with the License. |
10 | | // You may obtain a copy of the License at |
11 | | // http://www.apache.org/licenses/LICENSE-2.0 |
12 | | // Unless required by applicable law or agreed to in writing, software |
13 | | // distributed under the License is distributed on an "AS IS" BASIS, |
14 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | | // See the License for the specific language governing permissions and |
16 | | // limitations under the License. |
17 | | // |
18 | | /////////////////////////////////////////////////////////////////////// |
19 | | |
20 | | #include "ambigs.h" |
21 | | |
22 | | #include "helpers.h" |
23 | | #include "universalambigs.h" |
24 | | |
25 | | #include <cstdio> |
26 | | |
27 | | #if defined(_WIN32) && !defined(__GNUC__) |
28 | | # define strtok_r(str, delim, saveptr) strtok_s(str, delim, saveptr) |
29 | | #endif /* _WIN32 && !__GNUC__ */ |
30 | | |
31 | | namespace tesseract { |
32 | | |
33 | | static const char kAmbigDelimiters[] = "\t "; |
34 | | static const char kIllegalMsg[] = "Illegal ambiguity specification on line %d\n"; |
35 | | static const char kIllegalUnicharMsg[] = "Illegal unichar %s in ambiguity specification\n"; |
36 | | |
37 | | // Maximum line size: |
38 | | // 10 for sizes of ambigs, tabs, ambig type and newline |
39 | | // UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig |
40 | | const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1); |
41 | | |
42 | 76.1k | AmbigSpec::AmbigSpec() : correct_ngram_id(INVALID_UNICHAR_ID), type(NOT_AMBIG), wrong_ngram_size(0) { |
43 | 76.1k | wrong_ngram[0] = INVALID_UNICHAR_ID; |
44 | 76.1k | correct_fragments[0] = INVALID_UNICHAR_ID; |
45 | 76.1k | } |
46 | | |
47 | | // Initializes the ambigs by adding a nullptr pointer to each table. |
48 | 4 | void UnicharAmbigs::InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption) { |
49 | 456 | for (unsigned i = 0; i < unicharset.size(); ++i) { |
50 | 452 | replace_ambigs_.push_back(nullptr); |
51 | 452 | dang_ambigs_.push_back(nullptr); |
52 | 452 | one_to_one_definite_ambigs_.push_back(nullptr); |
53 | 452 | if (use_ambigs_for_adaption) { |
54 | 0 | ambigs_for_adaption_.push_back(nullptr); |
55 | 0 | reverse_ambigs_for_adaption_.push_back(nullptr); |
56 | 0 | } |
57 | 452 | } |
58 | 4 | } |
59 | | |
60 | | // Loads the universal ambigs that are useful for any language. |
61 | 4 | void UnicharAmbigs::LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset) { |
62 | 4 | TFile file; |
63 | 4 | if (!file.Open(kUniversalAmbigsFile, ksizeofUniversalAmbigsFile)) { |
64 | 0 | return; |
65 | 0 | } |
66 | 4 | LoadUnicharAmbigs(encoder_set, &file, 0, false, unicharset); |
67 | 4 | } |
68 | | |
69 | | void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambig_file, |
70 | | int debug_level, bool use_ambigs_for_adaption, |
71 | 8 | UNICHARSET *unicharset) { |
72 | 8 | UnicharIdVector *adaption_ambigs_entry; |
73 | 8 | if (debug_level) { |
74 | 0 | tprintf("Reading ambiguities\n"); |
75 | 0 | } |
76 | | |
77 | 8 | int test_ambig_part_size; |
78 | 8 | int replacement_ambig_part_size; |
79 | | // The space for buffer is allocated on the heap to avoid |
80 | | // GCC frame size warning. |
81 | 8 | const int kBufferSize = 10 + 2 * kMaxAmbigStringSize; |
82 | 8 | char *buffer = new char[kBufferSize]; |
83 | 8 | char replacement_string[kMaxAmbigStringSize]; |
84 | 8 | UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1]; |
85 | 8 | int line_num = 0; |
86 | 8 | int type = NOT_AMBIG; |
87 | | |
88 | | // Determine the version of the ambigs file. |
89 | 8 | int version = 0; |
90 | 8 | ASSERT_HOST(ambig_file->FGets(buffer, kBufferSize) != nullptr && buffer[0] != '\0'); |
91 | 8 | if (*buffer == 'v') { |
92 | 8 | version = static_cast<int>(strtol(buffer + 1, nullptr, 10)); |
93 | 8 | ++line_num; |
94 | 8 | } else { |
95 | 0 | ambig_file->Rewind(); |
96 | 0 | } |
97 | 76.3k | while (ambig_file->FGets(buffer, kBufferSize) != nullptr) { |
98 | 76.3k | chomp_string(buffer); |
99 | 76.3k | if (debug_level > 2) { |
100 | 0 | tprintf("read line %s\n", buffer); |
101 | 0 | } |
102 | 76.3k | ++line_num; |
103 | 76.3k | if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set, buffer, |
104 | 76.3k | &test_ambig_part_size, test_unichar_ids, &replacement_ambig_part_size, |
105 | 76.3k | replacement_string, &type)) { |
106 | 228 | continue; |
107 | 228 | } |
108 | | // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST. |
109 | 76.1k | auto *ambig_spec = new AmbigSpec(); |
110 | 76.1k | if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_, |
111 | 76.1k | test_ambig_part_size, test_unichar_ids, replacement_ambig_part_size, |
112 | 76.1k | replacement_string, type, ambig_spec, unicharset)) { |
113 | 228 | continue; |
114 | 228 | } |
115 | | |
116 | | // Update one_to_one_definite_ambigs_. |
117 | 75.9k | if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 && type == DEFINITE_AMBIG) { |
118 | 0 | if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == nullptr) { |
119 | 0 | one_to_one_definite_ambigs_[test_unichar_ids[0]] = new UnicharIdVector(); |
120 | 0 | } |
121 | 0 | one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back(ambig_spec->correct_ngram_id); |
122 | 0 | } |
123 | | // Update ambigs_for_adaption_. |
124 | 75.9k | if (use_ambigs_for_adaption) { |
125 | 0 | std::vector<UNICHAR_ID> encoding; |
126 | | // Silently ignore invalid strings, as before, so it is safe to use a |
127 | | // universal ambigs file. |
128 | 0 | if (unicharset->encode_string(replacement_string, true, &encoding, nullptr, nullptr)) { |
129 | 0 | for (int i = 0; i < test_ambig_part_size; ++i) { |
130 | 0 | if (ambigs_for_adaption_[test_unichar_ids[i]] == nullptr) { |
131 | 0 | ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector(); |
132 | 0 | } |
133 | 0 | adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]]; |
134 | 0 | for (int id_to_insert : encoding) { |
135 | 0 | ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID); |
136 | | // Add the new unichar id to adaption_ambigs_entry (only if the |
137 | | // vector does not already contain it) keeping it in sorted order. |
138 | 0 | size_t j; |
139 | 0 | for (j = 0; |
140 | 0 | j < adaption_ambigs_entry->size() && (*adaption_ambigs_entry)[j] > id_to_insert; |
141 | 0 | ++j) { |
142 | 0 | } |
143 | 0 | if (j < adaption_ambigs_entry->size()) { |
144 | 0 | if ((*adaption_ambigs_entry)[j] != id_to_insert) { |
145 | 0 | adaption_ambigs_entry->insert(adaption_ambigs_entry->begin() + j, id_to_insert); |
146 | 0 | } |
147 | 0 | } else { |
148 | 0 | adaption_ambigs_entry->push_back(id_to_insert); |
149 | 0 | } |
150 | 0 | } |
151 | 0 | } |
152 | 0 | } |
153 | 0 | } |
154 | 75.9k | } |
155 | 8 | delete[] buffer; |
156 | | |
157 | | // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector. |
158 | 8 | if (use_ambigs_for_adaption) { |
159 | 0 | for (size_t i = 0; i < ambigs_for_adaption_.size(); ++i) { |
160 | 0 | adaption_ambigs_entry = ambigs_for_adaption_[i]; |
161 | 0 | if (adaption_ambigs_entry == nullptr) { |
162 | 0 | continue; |
163 | 0 | } |
164 | 0 | for (size_t j = 0; j < adaption_ambigs_entry->size(); ++j) { |
165 | 0 | UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j]; |
166 | 0 | if (reverse_ambigs_for_adaption_[ambig_id] == nullptr) { |
167 | 0 | reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector(); |
168 | 0 | } |
169 | 0 | reverse_ambigs_for_adaption_[ambig_id]->push_back(i); |
170 | 0 | } |
171 | 0 | } |
172 | 0 | } |
173 | | |
174 | | // Print what was read from the input file. |
175 | 8 | if (debug_level > 1) { |
176 | 0 | for (int tbl = 0; tbl < 2; ++tbl) { |
177 | 0 | const UnicharAmbigsVector &print_table = (tbl == 0) ? replace_ambigs_ : dang_ambigs_; |
178 | 0 | for (size_t i = 0; i < print_table.size(); ++i) { |
179 | 0 | AmbigSpec_LIST *lst = print_table[i]; |
180 | 0 | if (lst == nullptr) { |
181 | 0 | continue; |
182 | 0 | } |
183 | 0 | if (!lst->empty()) { |
184 | 0 | tprintf("%s Ambiguities for %s:\n", (tbl == 0) ? "Replaceable" : "Dangerous", |
185 | 0 | unicharset->debug_str(i).c_str()); |
186 | 0 | } |
187 | 0 | AmbigSpec_IT lst_it(lst); |
188 | 0 | for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) { |
189 | 0 | AmbigSpec *ambig_spec = lst_it.data(); |
190 | 0 | tprintf("wrong_ngram:"); |
191 | 0 | UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset); |
192 | 0 | tprintf("correct_fragments:"); |
193 | 0 | UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset); |
194 | 0 | } |
195 | 0 | } |
196 | 0 | } |
197 | 0 | if (use_ambigs_for_adaption) { |
198 | 0 | for (int vec_id = 0; vec_id < 2; ++vec_id) { |
199 | 0 | const std::vector<UnicharIdVector *> &vec = |
200 | 0 | (vec_id == 0) ? ambigs_for_adaption_ : reverse_ambigs_for_adaption_; |
201 | 0 | for (size_t i = 0; i < vec.size(); ++i) { |
202 | 0 | adaption_ambigs_entry = vec[i]; |
203 | 0 | if (adaption_ambigs_entry != nullptr) { |
204 | 0 | tprintf("%sAmbigs for adaption for %s:\n", (vec_id == 0) ? "" : "Reverse ", |
205 | 0 | unicharset->debug_str(i).c_str()); |
206 | 0 | for (size_t j = 0; j < adaption_ambigs_entry->size(); ++j) { |
207 | 0 | tprintf("%s ", unicharset->debug_str((*adaption_ambigs_entry)[j]).c_str()); |
208 | 0 | } |
209 | 0 | tprintf("\n"); |
210 | 0 | } |
211 | 0 | } |
212 | 0 | } |
213 | 0 | } |
214 | 0 | } |
215 | 8 | } |
216 | | |
217 | | bool UnicharAmbigs::ParseAmbiguityLine(int line_num, int version, int debug_level, |
218 | | const UNICHARSET &unicharset, char *buffer, |
219 | | int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids, |
220 | | int *replacement_ambig_part_size, char *replacement_string, |
221 | 76.3k | int *type) { |
222 | 76.3k | if (version > 1) { |
223 | | // Simpler format is just wrong-string correct-string type\n. |
224 | 76.0k | std::string input(buffer); |
225 | 76.0k | std::vector<std::string> fields = split(input, ' '); |
226 | 76.0k | if (fields.size() != 3) { |
227 | 4 | if (debug_level) { |
228 | 0 | tprintf(kIllegalMsg, line_num); |
229 | 0 | } |
230 | 4 | return false; |
231 | 4 | } |
232 | | // Encode wrong-string. |
233 | 76.0k | std::vector<UNICHAR_ID> unichars; |
234 | 76.0k | if (!unicharset.encode_string(fields[0].c_str(), true, &unichars, nullptr, nullptr)) { |
235 | 56 | return false; |
236 | 56 | } |
237 | 75.9k | *test_ambig_part_size = unichars.size(); |
238 | 75.9k | if (*test_ambig_part_size > MAX_AMBIG_SIZE) { |
239 | 0 | if (debug_level) { |
240 | 0 | tprintf("Too many unichars in ambiguity on line %d\n", line_num); |
241 | 0 | } |
242 | 0 | return false; |
243 | 0 | } |
244 | | // Copy encoded string to output. |
245 | 303k | for (size_t i = 0; i < unichars.size(); ++i) { |
246 | 227k | test_unichar_ids[i] = unichars[i]; |
247 | 227k | } |
248 | 75.9k | test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID; |
249 | | // Encode replacement-string to check validity. |
250 | 75.9k | if (!unicharset.encode_string(fields[1].c_str(), true, &unichars, nullptr, nullptr)) { |
251 | 56 | return false; |
252 | 56 | } |
253 | 75.9k | *replacement_ambig_part_size = unichars.size(); |
254 | 75.9k | if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) { |
255 | 0 | if (debug_level) { |
256 | 0 | tprintf("Too many unichars in ambiguity on line %d\n", line_num); |
257 | 0 | } |
258 | 0 | return false; |
259 | 0 | } |
260 | 75.9k | if (sscanf(fields[2].c_str(), "%d", type) != 1) { |
261 | 0 | if (debug_level) { |
262 | 0 | tprintf(kIllegalMsg, line_num); |
263 | 0 | } |
264 | 0 | return false; |
265 | 0 | } |
266 | 75.9k | snprintf(replacement_string, kMaxAmbigStringSize, "%s", fields[1].c_str()); |
267 | 75.9k | return true; |
268 | 75.9k | } |
269 | 340 | int i; |
270 | 340 | char *next_token; |
271 | 340 | char *token = strtok_r(buffer, kAmbigDelimiters, &next_token); |
272 | 340 | if (!token || sscanf(token, "%d", test_ambig_part_size) != 1 || |
273 | 340 | *test_ambig_part_size <= 0) { |
274 | 0 | if (debug_level) { |
275 | 0 | tprintf(kIllegalMsg, line_num); |
276 | 0 | } |
277 | 0 | return false; |
278 | 0 | } |
279 | 340 | if (*test_ambig_part_size > MAX_AMBIG_SIZE) { |
280 | 0 | if (debug_level) { |
281 | 0 | tprintf("Too many unichars in ambiguity on line %d\n", line_num); |
282 | 0 | } |
283 | 0 | return false; |
284 | 0 | } |
285 | 784 | for (i = 0; i < *test_ambig_part_size; ++i) { |
286 | 500 | if (!(token = strtok_r(nullptr, kAmbigDelimiters, &next_token))) { |
287 | 0 | break; |
288 | 0 | } |
289 | 500 | if (!unicharset.contains_unichar(token)) { |
290 | 56 | if (debug_level) { |
291 | 0 | tprintf(kIllegalUnicharMsg, token); |
292 | 0 | } |
293 | 56 | break; |
294 | 56 | } |
295 | 444 | test_unichar_ids[i] = unicharset.unichar_to_id(token); |
296 | 444 | } |
297 | 340 | test_unichar_ids[i] = INVALID_UNICHAR_ID; |
298 | | |
299 | 340 | if (i != *test_ambig_part_size || !(token = strtok_r(nullptr, kAmbigDelimiters, &next_token)) || |
300 | 340 | sscanf(token, "%d", replacement_ambig_part_size) != 1 || |
301 | 340 | *replacement_ambig_part_size <= 0) { |
302 | 56 | if (debug_level) { |
303 | 0 | tprintf(kIllegalMsg, line_num); |
304 | 0 | } |
305 | 56 | return false; |
306 | 56 | } |
307 | 284 | if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) { |
308 | 0 | if (debug_level) { |
309 | 0 | tprintf("Too many unichars in ambiguity on line %d\n", line_num); |
310 | 0 | } |
311 | 0 | return false; |
312 | 0 | } |
313 | 284 | replacement_string[0] = '\0'; |
314 | 596 | for (i = 0; i < *replacement_ambig_part_size; ++i) { |
315 | 368 | if (!(token = strtok_r(nullptr, kAmbigDelimiters, &next_token))) { |
316 | 0 | break; |
317 | 0 | } |
318 | 368 | strcat(replacement_string, token); |
319 | 368 | if (!unicharset.contains_unichar(token)) { |
320 | 56 | if (debug_level) { |
321 | 0 | tprintf(kIllegalUnicharMsg, token); |
322 | 0 | } |
323 | 56 | break; |
324 | 56 | } |
325 | 368 | } |
326 | 284 | if (i != *replacement_ambig_part_size) { |
327 | 56 | if (debug_level) { |
328 | 0 | tprintf(kIllegalMsg, line_num); |
329 | 0 | } |
330 | 56 | return false; |
331 | 56 | } |
332 | 228 | if (version > 0) { |
333 | | // The next field being true indicates that the ambiguity should |
334 | | // always be substituted (e.g. '' should always be changed to "). |
335 | | // For such "certain" n -> m ambigs tesseract will insert character |
336 | | // fragments for the n pieces in the unicharset. AmbigsFound() |
337 | | // will then replace the incorrect ngram with the character |
338 | | // fragments of the correct character (or ngram if m > 1). |
339 | | // Note that if m > 1, an ngram will be inserted into the |
340 | | // modified word, not the individual unigrams. Tesseract |
341 | | // has limited support for ngram unichar (e.g. dawg permuter). |
342 | 228 | token = strtok_r(nullptr, kAmbigDelimiters, &next_token); |
343 | 228 | if (!token || sscanf(token, "%d", type) != 1) { |
344 | 0 | if (debug_level) { |
345 | 0 | tprintf(kIllegalMsg, line_num); |
346 | 0 | } |
347 | 0 | return false; |
348 | 0 | } |
349 | 228 | } |
350 | 228 | return true; |
351 | 228 | } |
352 | | |
353 | | bool UnicharAmbigs::InsertIntoTable(UnicharAmbigsVector &table, int test_ambig_part_size, |
354 | | UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size, |
355 | | const char *replacement_string, int type, AmbigSpec *ambig_spec, |
356 | 76.1k | UNICHARSET *unicharset) { |
357 | 76.1k | ambig_spec->type = static_cast<AmbigType>(type); |
358 | 76.1k | if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 && |
359 | 76.1k | unicharset->to_lower(test_unichar_ids[0]) == |
360 | 88 | unicharset->to_lower(unicharset->unichar_to_id(replacement_string))) { |
361 | 0 | ambig_spec->type = CASE_AMBIG; |
362 | 0 | } |
363 | | |
364 | 76.1k | ambig_spec->wrong_ngram_size = |
365 | 76.1k | UnicharIdArrayUtils::copy(test_unichar_ids, ambig_spec->wrong_ngram); |
366 | | |
367 | | // Since we need to maintain a constant number of unichar positions in |
368 | | // order to construct ambig_blob_choices vector in NoDangerousAmbig(), for |
369 | | // each n->m ambiguity we will have to place n character fragments of the |
370 | | // correct ngram into the corresponding positions in the vector (e.g. given |
371 | | // "vvvvw" and vvvv->ww we will place v and |ww|0|4 into position 0, v and |
372 | | // |ww|1|4 into position 1 and so on. The correct ngram is reconstructed |
373 | | // from fragments by dawg_permute_and_select(). |
374 | | |
375 | | // Insert the corresponding correct ngram into the unicharset. |
376 | | // Unicharset code assumes that the "base" ngram is inserted into |
377 | | // the unicharset before fragments of this ngram are inserted. |
378 | 76.1k | unicharset->unichar_insert(replacement_string, OldUncleanUnichars::kTrue); |
379 | 76.1k | ambig_spec->correct_ngram_id = unicharset->unichar_to_id(replacement_string); |
380 | 76.1k | if (replacement_ambig_part_size > 1) { |
381 | 75.8k | unicharset->set_isngram(ambig_spec->correct_ngram_id, true); |
382 | 75.8k | } |
383 | | // Add the corresponding fragments of the wrong ngram to unicharset. |
384 | 76.1k | int i; |
385 | 303k | for (i = 0; i < test_ambig_part_size; ++i) { |
386 | 227k | UNICHAR_ID unichar_id; |
387 | 227k | if (test_ambig_part_size == 1) { |
388 | 184 | unichar_id = ambig_spec->correct_ngram_id; |
389 | 227k | } else { |
390 | 227k | std::string frag_str = |
391 | 227k | CHAR_FRAGMENT::to_string(replacement_string, i, test_ambig_part_size, false); |
392 | 227k | unicharset->unichar_insert(frag_str.c_str(), OldUncleanUnichars::kTrue); |
393 | 227k | unichar_id = unicharset->unichar_to_id(frag_str.c_str()); |
394 | 227k | } |
395 | 227k | ambig_spec->correct_fragments[i] = unichar_id; |
396 | 227k | } |
397 | 76.1k | ambig_spec->correct_fragments[i] = INVALID_UNICHAR_ID; |
398 | | |
399 | | // Add AmbigSpec for this ambiguity to the corresponding AmbigSpec_LIST. |
400 | | // Keep AmbigSpec_LISTs sorted by AmbigSpec.wrong_ngram. |
401 | 76.1k | if (table[test_unichar_ids[0]] == nullptr) { |
402 | 304 | table[test_unichar_ids[0]] = new AmbigSpec_LIST(); |
403 | 304 | } |
404 | 76.1k | if (table[test_unichar_ids[0]]->add_sorted(AmbigSpec::compare_ambig_specs, true, ambig_spec)) { |
405 | 75.9k | return true; |
406 | 75.9k | } |
407 | 228 | delete ambig_spec; |
408 | 228 | return false; |
409 | 76.1k | } |
410 | | |
411 | | } // namespace tesseract |