/src/tesseract/src/ccutil/ambigs.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | ///////////////////////////////////////////////////////////////////////  | 
2  |  | // File:        ambigs.cpp  | 
3  |  | // Description: Functions for dealing with ambiguities  | 
4  |  | //              (training and recognition).  | 
5  |  | // Author:      Daria Antonova  | 
6  |  | //  | 
7  |  | // (C) Copyright 2008, Google Inc.  | 
8  |  | // Licensed under the Apache License, Version 2.0 (the "License");  | 
9  |  | // you may not use this file except in compliance with the License.  | 
10  |  | // You may obtain a copy of the License at  | 
11  |  | // http://www.apache.org/licenses/LICENSE-2.0  | 
12  |  | // Unless required by applicable law or agreed to in writing, software  | 
13  |  | // distributed under the License is distributed on an "AS IS" BASIS,  | 
14  |  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
15  |  | // See the License for the specific language governing permissions and  | 
16  |  | // limitations under the License.  | 
17  |  | //  | 
18  |  | ///////////////////////////////////////////////////////////////////////  | 
19  |  |  | 
20  |  | #include "ambigs.h"  | 
21  |  |  | 
22  |  | #include "helpers.h"  | 
23  |  | #include "universalambigs.h"  | 
24  |  |  | 
25  |  | #include <cstdio>  | 
26  |  |  | 
27  |  | #if defined(_WIN32) && !defined(__GNUC__)  | 
28  |  | #  define strtok_r(str, delim, saveptr) strtok_s(str, delim, saveptr)  | 
29  |  | #endif /* _WIN32 && !__GNUC__ */  | 
30  |  |  | 
31  |  | namespace tesseract { | 
32  |  |  | 
33  |  | static const char kAmbigDelimiters[] = "\t ";  | 
34  |  | static const char kIllegalMsg[] = "Illegal ambiguity specification on line %d\n";  | 
35  |  | static const char kIllegalUnicharMsg[] = "Illegal unichar %s in ambiguity specification\n";  | 
36  |  |  | 
37  |  | // Maximum line size:  | 
38  |  | //   10 for sizes of ambigs, tabs, abmig type and newline  | 
39  |  | //   UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig  | 
40  |  | const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);  | 
41  |  |  | 
42  | 76.1k  | AmbigSpec::AmbigSpec() : correct_ngram_id(INVALID_UNICHAR_ID), type(NOT_AMBIG), wrong_ngram_size(0) { | 
43  | 76.1k  |   wrong_ngram[0] = INVALID_UNICHAR_ID;  | 
44  | 76.1k  |   correct_fragments[0] = INVALID_UNICHAR_ID;  | 
45  | 76.1k  | }  | 
46  |  |  | 
47  |  | // Initializes the ambigs by adding a nullptr pointer to each table.  | 
48  | 4  | void UnicharAmbigs::InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption) { | 
49  | 456  |   for (unsigned i = 0; i < unicharset.size(); ++i) { | 
50  | 452  |     replace_ambigs_.push_back(nullptr);  | 
51  | 452  |     dang_ambigs_.push_back(nullptr);  | 
52  | 452  |     one_to_one_definite_ambigs_.push_back(nullptr);  | 
53  | 452  |     if (use_ambigs_for_adaption) { | 
54  | 0  |       ambigs_for_adaption_.push_back(nullptr);  | 
55  | 0  |       reverse_ambigs_for_adaption_.push_back(nullptr);  | 
56  | 0  |     }  | 
57  | 452  |   }  | 
58  | 4  | }  | 
59  |  |  | 
60  |  | // Loads the universal ambigs that are useful for any language.  | 
61  | 4  | void UnicharAmbigs::LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset) { | 
62  | 4  |   TFile file;  | 
63  | 4  |   if (!file.Open(kUniversalAmbigsFile, ksizeofUniversalAmbigsFile)) { | 
64  | 0  |     return;  | 
65  | 0  |   }  | 
66  | 4  |   LoadUnicharAmbigs(encoder_set, &file, 0, false, unicharset);  | 
67  | 4  | }  | 
68  |  |  | 
69  |  | void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambig_file,  | 
70  |  |                                       int debug_level, bool use_ambigs_for_adaption,  | 
71  | 8  |                                       UNICHARSET *unicharset) { | 
72  | 8  |   UnicharIdVector *adaption_ambigs_entry;  | 
73  | 8  |   if (debug_level) { | 
74  | 0  |     tprintf("Reading ambiguities\n"); | 
75  | 0  |   }  | 
76  |  |  | 
77  | 8  |   int test_ambig_part_size;  | 
78  | 8  |   int replacement_ambig_part_size;  | 
79  |  |   // The space for buffer is allocated on the heap to avoid  | 
80  |  |   // GCC frame size warning.  | 
81  | 8  |   const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;  | 
82  | 8  |   char *buffer = new char[kBufferSize];  | 
83  | 8  |   char replacement_string[kMaxAmbigStringSize];  | 
84  | 8  |   UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1];  | 
85  | 8  |   int line_num = 0;  | 
86  | 8  |   int type = NOT_AMBIG;  | 
87  |  |  | 
88  |  |   // Determine the version of the ambigs file.  | 
89  | 8  |   int version = 0;  | 
90  | 8  |   ASSERT_HOST(ambig_file->FGets(buffer, kBufferSize) != nullptr && buffer[0] != '\0');  | 
91  | 8  |   if (*buffer == 'v') { | 
92  | 8  |     version = static_cast<int>(strtol(buffer + 1, nullptr, 10));  | 
93  | 8  |     ++line_num;  | 
94  | 8  |   } else { | 
95  | 0  |     ambig_file->Rewind();  | 
96  | 0  |   }  | 
97  | 76.3k  |   while (ambig_file->FGets(buffer, kBufferSize) != nullptr) { | 
98  | 76.3k  |     chomp_string(buffer);  | 
99  | 76.3k  |     if (debug_level > 2) { | 
100  | 0  |       tprintf("read line %s\n", buffer); | 
101  | 0  |     }  | 
102  | 76.3k  |     ++line_num;  | 
103  | 76.3k  |     if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set, buffer,  | 
104  | 76.3k  |                             &test_ambig_part_size, test_unichar_ids, &replacement_ambig_part_size,  | 
105  | 76.3k  |                             replacement_string, &type)) { | 
106  | 228  |       continue;  | 
107  | 228  |     }  | 
108  |  |     // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.  | 
109  | 76.1k  |     auto *ambig_spec = new AmbigSpec();  | 
110  | 76.1k  |     if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,  | 
111  | 76.1k  |                          test_ambig_part_size, test_unichar_ids, replacement_ambig_part_size,  | 
112  | 76.1k  |                          replacement_string, type, ambig_spec, unicharset)) { | 
113  | 228  |       continue;  | 
114  | 228  |     }  | 
115  |  |  | 
116  |  |     // Update one_to_one_definite_ambigs_.  | 
117  | 75.9k  |     if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 && type == DEFINITE_AMBIG) { | 
118  | 0  |       if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == nullptr) { | 
119  | 0  |         one_to_one_definite_ambigs_[test_unichar_ids[0]] = new UnicharIdVector();  | 
120  | 0  |       }  | 
121  | 0  |       one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back(ambig_spec->correct_ngram_id);  | 
122  | 0  |     }  | 
123  |  |     // Update ambigs_for_adaption_.  | 
124  | 75.9k  |     if (use_ambigs_for_adaption) { | 
125  | 0  |       std::vector<UNICHAR_ID> encoding;  | 
126  |  |       // Silently ignore invalid strings, as before, so it is safe to use a  | 
127  |  |       // universal ambigs file.  | 
128  | 0  |       if (unicharset->encode_string(replacement_string, true, &encoding, nullptr, nullptr)) { | 
129  | 0  |         for (int i = 0; i < test_ambig_part_size; ++i) { | 
130  | 0  |           if (ambigs_for_adaption_[test_unichar_ids[i]] == nullptr) { | 
131  | 0  |             ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector();  | 
132  | 0  |           }  | 
133  | 0  |           adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]];  | 
134  | 0  |           for (int id_to_insert : encoding) { | 
135  | 0  |             ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);  | 
136  |  |             // Add the new unichar id to adaption_ambigs_entry (only if the  | 
137  |  |             // vector does not already contain it) keeping it in sorted order.  | 
138  | 0  |             size_t j;  | 
139  | 0  |             for (j = 0;  | 
140  | 0  |                  j < adaption_ambigs_entry->size() && (*adaption_ambigs_entry)[j] > id_to_insert;  | 
141  | 0  |                  ++j) { | 
142  | 0  |             }  | 
143  | 0  |             if (j < adaption_ambigs_entry->size()) { | 
144  | 0  |               if ((*adaption_ambigs_entry)[j] != id_to_insert) { | 
145  | 0  |                 adaption_ambigs_entry->insert(adaption_ambigs_entry->begin() + j, id_to_insert);  | 
146  | 0  |               }  | 
147  | 0  |             } else { | 
148  | 0  |               adaption_ambigs_entry->push_back(id_to_insert);  | 
149  | 0  |             }  | 
150  | 0  |           }  | 
151  | 0  |         }  | 
152  | 0  |       }  | 
153  | 0  |     }  | 
154  | 75.9k  |   }  | 
155  | 8  |   delete[] buffer;  | 
156  |  |  | 
157  |  |   // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector.  | 
158  | 8  |   if (use_ambigs_for_adaption) { | 
159  | 0  |     for (size_t i = 0; i < ambigs_for_adaption_.size(); ++i) { | 
160  | 0  |       adaption_ambigs_entry = ambigs_for_adaption_[i];  | 
161  | 0  |       if (adaption_ambigs_entry == nullptr) { | 
162  | 0  |         continue;  | 
163  | 0  |       }  | 
164  | 0  |       for (size_t j = 0; j < adaption_ambigs_entry->size(); ++j) { | 
165  | 0  |         UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j];  | 
166  | 0  |         if (reverse_ambigs_for_adaption_[ambig_id] == nullptr) { | 
167  | 0  |           reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector();  | 
168  | 0  |         }  | 
169  | 0  |         reverse_ambigs_for_adaption_[ambig_id]->push_back(i);  | 
170  | 0  |       }  | 
171  | 0  |     }  | 
172  | 0  |   }  | 
173  |  |  | 
174  |  |   // Print what was read from the input file.  | 
175  | 8  |   if (debug_level > 1) { | 
176  | 0  |     for (int tbl = 0; tbl < 2; ++tbl) { | 
177  | 0  |       const UnicharAmbigsVector &print_table = (tbl == 0) ? replace_ambigs_ : dang_ambigs_;  | 
178  | 0  |       for (size_t i = 0; i < print_table.size(); ++i) { | 
179  | 0  |         AmbigSpec_LIST *lst = print_table[i];  | 
180  | 0  |         if (lst == nullptr) { | 
181  | 0  |           continue;  | 
182  | 0  |         }  | 
183  | 0  |         if (!lst->empty()) { | 
184  | 0  |           tprintf("%s Ambiguities for %s:\n", (tbl == 0) ? "Replaceable" : "Dangerous", | 
185  | 0  |                   unicharset->debug_str(i).c_str());  | 
186  | 0  |         }  | 
187  | 0  |         AmbigSpec_IT lst_it(lst);  | 
188  | 0  |         for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) { | 
189  | 0  |           AmbigSpec *ambig_spec = lst_it.data();  | 
190  | 0  |           tprintf("wrong_ngram:"); | 
191  | 0  |           UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);  | 
192  | 0  |           tprintf("correct_fragments:"); | 
193  | 0  |           UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset);  | 
194  | 0  |         }  | 
195  | 0  |       }  | 
196  | 0  |     }  | 
197  | 0  |     if (use_ambigs_for_adaption) { | 
198  | 0  |       for (int vec_id = 0; vec_id < 2; ++vec_id) { | 
199  | 0  |         const std::vector<UnicharIdVector *> &vec =  | 
200  | 0  |             (vec_id == 0) ? ambigs_for_adaption_ : reverse_ambigs_for_adaption_;  | 
201  | 0  |         for (size_t i = 0; i < vec.size(); ++i) { | 
202  | 0  |           adaption_ambigs_entry = vec[i];  | 
203  | 0  |           if (adaption_ambigs_entry != nullptr) { | 
204  | 0  |             tprintf("%sAmbigs for adaption for %s:\n", (vec_id == 0) ? "" : "Reverse ", | 
205  | 0  |                     unicharset->debug_str(i).c_str());  | 
206  | 0  |             for (size_t j = 0; j < adaption_ambigs_entry->size(); ++j) { | 
207  | 0  |               tprintf("%s ", unicharset->debug_str((*adaption_ambigs_entry)[j]).c_str()); | 
208  | 0  |             }  | 
209  | 0  |             tprintf("\n"); | 
210  | 0  |           }  | 
211  | 0  |         }  | 
212  | 0  |       }  | 
213  | 0  |     }  | 
214  | 0  |   }  | 
215  | 8  | }  | 
216  |  |  | 
217  |  | bool UnicharAmbigs::ParseAmbiguityLine(int line_num, int version, int debug_level,  | 
218  |  |                                        const UNICHARSET &unicharset, char *buffer,  | 
219  |  |                                        int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids,  | 
220  |  |                                        int *replacement_ambig_part_size, char *replacement_string,  | 
221  | 76.3k  |                                        int *type) { | 
222  | 76.3k  |   if (version > 1) { | 
223  |  |     // Simpler format is just wrong-string correct-string type\n.  | 
224  | 76.0k  |     std::string input(buffer);  | 
225  | 76.0k  |     std::vector<std::string> fields = split(input, ' ');  | 
226  | 76.0k  |     if (fields.size() != 3) { | 
227  | 4  |       if (debug_level) { | 
228  | 0  |         tprintf(kIllegalMsg, line_num);  | 
229  | 0  |       }  | 
230  | 4  |       return false;  | 
231  | 4  |     }  | 
232  |  |     // Encode wrong-string.  | 
233  | 76.0k  |     std::vector<UNICHAR_ID> unichars;  | 
234  | 76.0k  |     if (!unicharset.encode_string(fields[0].c_str(), true, &unichars, nullptr, nullptr)) { | 
235  | 56  |       return false;  | 
236  | 56  |     }  | 
237  | 75.9k  |     *test_ambig_part_size = unichars.size();  | 
238  | 75.9k  |     if (*test_ambig_part_size > MAX_AMBIG_SIZE) { | 
239  | 0  |       if (debug_level) { | 
240  | 0  |         tprintf("Too many unichars in ambiguity on line %d\n", line_num); | 
241  | 0  |       }  | 
242  | 0  |       return false;  | 
243  | 0  |     }  | 
244  |  |     // Copy encoded string to output.  | 
245  | 303k  |     for (size_t i = 0; i < unichars.size(); ++i) { | 
246  | 227k  |       test_unichar_ids[i] = unichars[i];  | 
247  | 227k  |     }  | 
248  | 75.9k  |     test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID;  | 
249  |  |     // Encode replacement-string to check validity.  | 
250  | 75.9k  |     if (!unicharset.encode_string(fields[1].c_str(), true, &unichars, nullptr, nullptr)) { | 
251  | 56  |       return false;  | 
252  | 56  |     }  | 
253  | 75.9k  |     *replacement_ambig_part_size = unichars.size();  | 
254  | 75.9k  |     if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) { | 
255  | 0  |       if (debug_level) { | 
256  | 0  |         tprintf("Too many unichars in ambiguity on line %d\n", line_num); | 
257  | 0  |       }  | 
258  | 0  |       return false;  | 
259  | 0  |     }  | 
260  | 75.9k  |     if (sscanf(fields[2].c_str(), "%d", type) != 1) { | 
261  | 0  |       if (debug_level) { | 
262  | 0  |         tprintf(kIllegalMsg, line_num);  | 
263  | 0  |       }  | 
264  | 0  |       return false;  | 
265  | 0  |     }  | 
266  | 75.9k  |     snprintf(replacement_string, kMaxAmbigStringSize, "%s", fields[1].c_str());  | 
267  | 75.9k  |     return true;  | 
268  | 75.9k  |   }  | 
269  | 340  |   int i;  | 
270  | 340  |   char *next_token;  | 
271  | 340  |   char *token = strtok_r(buffer, kAmbigDelimiters, &next_token);  | 
272  | 340  |   if (!token || sscanf(token, "%d", test_ambig_part_size) != 1 ||  | 
273  | 340  |       *test_ambig_part_size <= 0) { | 
274  | 0  |     if (debug_level) { | 
275  | 0  |       tprintf(kIllegalMsg, line_num);  | 
276  | 0  |     }  | 
277  | 0  |     return false;  | 
278  | 0  |   }  | 
279  | 340  |   if (*test_ambig_part_size > MAX_AMBIG_SIZE) { | 
280  | 0  |     if (debug_level) { | 
281  | 0  |       tprintf("Too many unichars in ambiguity on line %d\n", line_num); | 
282  | 0  |     }  | 
283  | 0  |     return false;  | 
284  | 0  |   }  | 
285  | 784  |   for (i = 0; i < *test_ambig_part_size; ++i) { | 
286  | 500  |     if (!(token = strtok_r(nullptr, kAmbigDelimiters, &next_token))) { | 
287  | 0  |       break;  | 
288  | 0  |     }  | 
289  | 500  |     if (!unicharset.contains_unichar(token)) { | 
290  | 56  |       if (debug_level) { | 
291  | 0  |         tprintf(kIllegalUnicharMsg, token);  | 
292  | 0  |       }  | 
293  | 56  |       break;  | 
294  | 56  |     }  | 
295  | 444  |     test_unichar_ids[i] = unicharset.unichar_to_id(token);  | 
296  | 444  |   }  | 
297  | 340  |   test_unichar_ids[i] = INVALID_UNICHAR_ID;  | 
298  |  |  | 
299  | 340  |   if (i != *test_ambig_part_size || !(token = strtok_r(nullptr, kAmbigDelimiters, &next_token)) ||  | 
300  | 340  |       sscanf(token, "%d", replacement_ambig_part_size) != 1 ||  | 
301  | 340  |       *replacement_ambig_part_size <= 0) { | 
302  | 56  |     if (debug_level) { | 
303  | 0  |       tprintf(kIllegalMsg, line_num);  | 
304  | 0  |     }  | 
305  | 56  |     return false;  | 
306  | 56  |   }  | 
307  | 284  |   if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) { | 
308  | 0  |     if (debug_level) { | 
309  | 0  |       tprintf("Too many unichars in ambiguity on line %d\n", line_num); | 
310  | 0  |     }  | 
311  | 0  |     return false;  | 
312  | 0  |   }  | 
313  | 284  |   replacement_string[0] = '\0';  | 
314  | 596  |   for (i = 0; i < *replacement_ambig_part_size; ++i) { | 
315  | 368  |     if (!(token = strtok_r(nullptr, kAmbigDelimiters, &next_token))) { | 
316  | 0  |       break;  | 
317  | 0  |     }  | 
318  | 368  |     strcat(replacement_string, token);  | 
319  | 368  |     if (!unicharset.contains_unichar(token)) { | 
320  | 56  |       if (debug_level) { | 
321  | 0  |         tprintf(kIllegalUnicharMsg, token);  | 
322  | 0  |       }  | 
323  | 56  |       break;  | 
324  | 56  |     }  | 
325  | 368  |   }  | 
326  | 284  |   if (i != *replacement_ambig_part_size) { | 
327  | 56  |     if (debug_level) { | 
328  | 0  |       tprintf(kIllegalMsg, line_num);  | 
329  | 0  |     }  | 
330  | 56  |     return false;  | 
331  | 56  |   }  | 
332  | 228  |   if (version > 0) { | 
333  |  |     // The next field being true indicates that the ambiguity should  | 
334  |  |     // always be substituted (e.g. '' should always be changed to ").  | 
335  |  |     // For such "certain" n -> m ambigs tesseract will insert character  | 
336  |  |     // fragments for the n pieces in the unicharset. AmbigsFound()  | 
337  |  |     // will then replace the incorrect ngram with the character  | 
338  |  |     // fragments of the correct character (or ngram if m > 1).  | 
339  |  |     // Note that if m > 1, an ngram will be inserted into the  | 
340  |  |     // modified word, not the individual unigrams. Tesseract  | 
341  |  |     // has limited support for ngram unichar (e.g. dawg permuter).  | 
342  | 228  |     token = strtok_r(nullptr, kAmbigDelimiters, &next_token);  | 
343  | 228  |     if (!token || sscanf(token, "%d", type) != 1) { | 
344  | 0  |       if (debug_level) { | 
345  | 0  |         tprintf(kIllegalMsg, line_num);  | 
346  | 0  |       }  | 
347  | 0  |       return false;  | 
348  | 0  |     }  | 
349  | 228  |   }  | 
350  | 228  |   return true;  | 
351  | 228  | }  | 
352  |  |  | 
353  |  | bool UnicharAmbigs::InsertIntoTable(UnicharAmbigsVector &table, int test_ambig_part_size,  | 
354  |  |                                     UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size,  | 
355  |  |                                     const char *replacement_string, int type, AmbigSpec *ambig_spec,  | 
356  | 76.1k  |                                     UNICHARSET *unicharset) { | 
357  | 76.1k  |   ambig_spec->type = static_cast<AmbigType>(type);  | 
358  | 76.1k  |   if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 &&  | 
359  | 76.1k  |       unicharset->to_lower(test_unichar_ids[0]) ==  | 
360  | 88  |           unicharset->to_lower(unicharset->unichar_to_id(replacement_string))) { | 
361  | 0  |     ambig_spec->type = CASE_AMBIG;  | 
362  | 0  |   }  | 
363  |  |  | 
364  | 76.1k  |   ambig_spec->wrong_ngram_size =  | 
365  | 76.1k  |       UnicharIdArrayUtils::copy(test_unichar_ids, ambig_spec->wrong_ngram);  | 
366  |  |  | 
367  |  |   // Since we need to maintain a constant number of unichar positions in  | 
368  |  |   // order to construct ambig_blob_choices vector in NoDangerousAmbig(), for  | 
369  |  |   // each n->m ambiguity we will have to place n character fragments of the  | 
370  |  |   // correct ngram into the corresponding positions in the vector (e.g. given  | 
371  |  |   // "vvvvw" and vvvv->ww we will place v and |ww|0|4 into position 0, v and  | 
372  |  |   // |ww|1|4 into position 1 and so on. The correct ngram is reconstructed  | 
373  |  |   // from fragments by dawg_permute_and_select().  | 
374  |  |  | 
375  |  |   // Insert the corresponding correct ngram into the unicharset.  | 
376  |  |   // Unicharset code assumes that the "base" ngram is inserted into  | 
377  |  |   // the unicharset before fragments of this ngram are inserted.  | 
378  | 76.1k  |   unicharset->unichar_insert(replacement_string, OldUncleanUnichars::kTrue);  | 
379  | 76.1k  |   ambig_spec->correct_ngram_id = unicharset->unichar_to_id(replacement_string);  | 
380  | 76.1k  |   if (replacement_ambig_part_size > 1) { | 
381  | 75.8k  |     unicharset->set_isngram(ambig_spec->correct_ngram_id, true);  | 
382  | 75.8k  |   }  | 
383  |  |   // Add the corresponding fragments of the wrong ngram to unicharset.  | 
384  | 76.1k  |   int i;  | 
385  | 303k  |   for (i = 0; i < test_ambig_part_size; ++i) { | 
386  | 227k  |     UNICHAR_ID unichar_id;  | 
387  | 227k  |     if (test_ambig_part_size == 1) { | 
388  | 184  |       unichar_id = ambig_spec->correct_ngram_id;  | 
389  | 227k  |     } else { | 
390  | 227k  |       std::string frag_str =  | 
391  | 227k  |           CHAR_FRAGMENT::to_string(replacement_string, i, test_ambig_part_size, false);  | 
392  | 227k  |       unicharset->unichar_insert(frag_str.c_str(), OldUncleanUnichars::kTrue);  | 
393  | 227k  |       unichar_id = unicharset->unichar_to_id(frag_str.c_str());  | 
394  | 227k  |     }  | 
395  | 227k  |     ambig_spec->correct_fragments[i] = unichar_id;  | 
396  | 227k  |   }  | 
397  | 76.1k  |   ambig_spec->correct_fragments[i] = INVALID_UNICHAR_ID;  | 
398  |  |  | 
399  |  |   // Add AmbigSpec for this ambiguity to the corresponding AmbigSpec_LIST.  | 
400  |  |   // Keep AmbigSpec_LISTs sorted by AmbigSpec.wrong_ngram.  | 
401  | 76.1k  |   if (table[test_unichar_ids[0]] == nullptr) { | 
402  | 304  |     table[test_unichar_ids[0]] = new AmbigSpec_LIST();  | 
403  | 304  |   }  | 
404  | 76.1k  |   if (table[test_unichar_ids[0]]->add_sorted(AmbigSpec::compare_ambig_specs, true, ambig_spec)) { | 
405  | 75.9k  |     return true;  | 
406  | 75.9k  |   }  | 
407  | 228  |   delete ambig_spec;  | 
408  | 228  |   return false;  | 
409  | 76.1k  | }  | 
410  |  |  | 
411  |  | } // namespace tesseract  |