/src/tesseract/src/ccutil/ambigs.h
Line | Count | Source |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: ambigs.h |
3 | | // Description: Constants, flags, functions for dealing with |
4 | | // ambiguities (training and recognition). |
5 | | // Author: Daria Antonova |
6 | | // |
7 | | // (C) Copyright 2008, Google Inc. |
8 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
9 | | // you may not use this file except in compliance with the License. |
10 | | // You may obtain a copy of the License at |
11 | | // http://www.apache.org/licenses/LICENSE-2.0 |
12 | | // Unless required by applicable law or agreed to in writing, software |
13 | | // distributed under the License is distributed on an "AS IS" BASIS, |
14 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | | // See the License for the specific language governing permissions and |
16 | | // limitations under the License. |
17 | | // |
18 | | /////////////////////////////////////////////////////////////////////// |
19 | | |
20 | | #ifndef TESSERACT_CCUTIL_AMBIGS_H_ |
21 | | #define TESSERACT_CCUTIL_AMBIGS_H_ |
22 | | |
23 | | #ifdef HAVE_CONFIG_H |
24 | | # include "config_auto.h" // DISABLED_LEGACY_ENGINE |
25 | | #endif |
26 | | |
27 | | #if !defined(DISABLED_LEGACY_ENGINE) |
28 | | |
29 | | # include <tesseract/unichar.h> |
30 | | # include "elst.h" |
31 | | # include "tprintf.h" |
32 | | # include "unicharset.h" |
33 | | |
// Capacity constant for ambiguity ngrams: AmbigSpec::wrong_ngram and
// AmbigSpec::correct_fragments are declared with MAX_AMBIG_SIZE + 1 elements.
// NOTE(review): the extra element presumably holds the INVALID_UNICHAR_ID
// terminator - confirm against the code that fills those arrays.
34 | 152k | # define MAX_AMBIG_SIZE 10 |
35 | | |
36 | | namespace tesseract { |
37 | | |
// A resizable sequence of unichar ids.
38 | | using UnicharIdVector = std::vector<UNICHAR_ID>; |
39 | | |
// Kinds of relationship between an ocred ngram and a correct ngram.
// The enumerators are consecutive starting at zero, so AMBIG_TYPE_COUNT is
// the number of real types that precede it.
enum AmbigType {
  // The ngram pair is not ambiguous.
  NOT_AMBIG = 0,
  // The ocred ngram should always be substituted with the correct one.
  REPLACE_AMBIG = 1,
  // Add the correct ngram to the classifier results (1-1).
  DEFINITE_AMBIG = 2,
  // Use the pairwise classifier for the ocred/correct pair (1-1).
  SIMILAR_AMBIG = 3,
  // This is a case ambiguity (1-1).
  CASE_AMBIG = 4,

  // Number of enum entries; must stay last.
  AMBIG_TYPE_COUNT = 5
};
49 | | |
50 | | // A collection of utility functions for arrays of UNICHAR_IDs that are |
51 | | // terminated by INVALID_UNICHAR_ID. |
52 | | class UnicharIdArrayUtils { |
53 | | public: |
54 | | // Compares two arrays of unichar ids. Returns -1 if the length of array1 is |
55 | | // less than length of array2, if any array1[i] is less than array2[i]. |
56 | | // Returns 0 if the arrays are equal, 1 otherwise. |
57 | | // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID. |
58 | 593M | static inline int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2) { |
59 | 1.19G | for (;;) { |
60 | 1.19G | const UNICHAR_ID val1 = *ptr1++; |
61 | 1.19G | const UNICHAR_ID val2 = *ptr2++; |
62 | 1.19G | if (val1 != val2) { |
63 | 590M | if (val1 == INVALID_UNICHAR_ID) { |
64 | 5.67M | return -1; |
65 | 5.67M | } |
66 | 585M | if (val2 == INVALID_UNICHAR_ID) { |
67 | 116 | return 1; |
68 | 116 | } |
69 | 585M | if (val1 < val2) { |
70 | 14.4M | return -1; |
71 | 14.4M | } |
72 | 570M | return 1; |
73 | 585M | } |
74 | 600M | if (val1 == INVALID_UNICHAR_ID) { |
75 | 2.09M | return 0; |
76 | 2.09M | } |
77 | 600M | } |
78 | 593M | } |
79 | | |
80 | | // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied. |
81 | | // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID |
82 | | // and that dst has enough space for all the elements from src. |
83 | 76.1k | static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) { |
84 | 76.1k | int i = 0; |
85 | 303k | do { |
86 | 303k | dst[i] = src[i]; |
87 | 303k | } while (dst[i++] != INVALID_UNICHAR_ID); |
88 | 76.1k | return i - 1; |
89 | 76.1k | } |
90 | | |
91 | | // Prints unichars corresponding to the unichar_ids in the given array. |
92 | | // The function assumes that array is terminated by INVALID_UNICHAR_ID. |
93 | 0 | static inline void print(const UNICHAR_ID array[], const UNICHARSET &unicharset) { |
94 | 0 | const UNICHAR_ID *ptr = array; |
95 | 0 | if (*ptr == INVALID_UNICHAR_ID) { |
96 | 0 | tprintf("[Empty]"); |
97 | 0 | } |
98 | 0 | while (*ptr != INVALID_UNICHAR_ID) { |
99 | 0 | tprintf("%s ", unicharset.id_to_unichar(*ptr++)); |
100 | 0 | } |
101 | 0 | tprintf("( "); |
102 | 0 | ptr = array; |
103 | 0 | while (*ptr != INVALID_UNICHAR_ID) { |
104 | 0 | tprintf("%d ", *ptr++); |
105 | 0 | } |
106 | 0 | tprintf(")\n"); |
107 | 0 | } |
108 | | }; |
109 | | |
110 | | // AMBIG_SPEC_LIST stores a list of dangerous ambigs that |
111 | | // start with the same unichar (e.g. r->t rn->m rr1->m). |
112 | | class AmbigSpec : public ELIST<AmbigSpec>::LINK { |
113 | | public: |
114 | | AmbigSpec(); |
115 | | ~AmbigSpec() = default; |
116 | | |
117 | | // Comparator function for sorting AmbigSpec_LISTs. The lists will |
118 | | // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors |
119 | | // in a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1]. |
120 | 10.5M | static int compare_ambig_specs(const AmbigSpec *s1, const AmbigSpec *s2) { |
121 | 10.5M | int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram); |
122 | 10.5M | if (result != 0) { |
123 | 10.5M | return result; |
124 | 10.5M | } |
125 | 436 | return UnicharIdArrayUtils::compare(s1->correct_fragments, s2->correct_fragments); |
126 | 10.5M | } |
127 | | |
128 | | UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; |
129 | | UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1]; |
130 | | UNICHAR_ID correct_ngram_id; |
131 | | AmbigType type; |
132 | | int wrong_ngram_size; |
133 | | }; |
134 | | ELISTIZEH(AmbigSpec) |
135 | | |
136 | | // An ambiguity table: entry i is the list (AmbigSpec_LIST) of ambiguities |
137 | | // whose wrong ngram starts with unichar id i. |
138 | | using UnicharAmbigsVector = std::vector<AmbigSpec_LIST *>; |
139 | | |
140 | | class UnicharAmbigs { |
141 | | public: |
142 | 8 | UnicharAmbigs() = default; |
143 | 0 | ~UnicharAmbigs() { |
144 | 0 | for (auto data : replace_ambigs_) { |
145 | 0 | delete data; |
146 | 0 | } |
147 | 0 | for (auto data : dang_ambigs_) { |
148 | 0 | delete data; |
149 | 0 | } |
150 | 0 | for (auto data : one_to_one_definite_ambigs_) { |
151 | 0 | delete data; |
152 | 0 | } |
153 | 0 | } |
154 | | |
155 | 817k | const UnicharAmbigsVector &dang_ambigs() const { |
156 | 817k | return dang_ambigs_; |
157 | 817k | } |
158 | 817k | const UnicharAmbigsVector &replace_ambigs() const { |
159 | 817k | return replace_ambigs_; |
160 | 817k | } |
161 | | |
162 | | // Initializes the ambigs by adding a nullptr pointer to each table. |
163 | | void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption); |
164 | | |
165 | | // Loads the universal ambigs that are useful for any language. |
166 | | void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset); |
167 | | |
168 | | // Fills in two ambiguity tables (replaceable and dangerous) with information |
169 | | // read from the ambigs file. An ambiguity table is an array of lists. |
170 | | // The array is indexed by a class id. Each entry in the table provides |
171 | | // a list of potential ambiguities which can start with the corresponding |
172 | | // character. For example the ambiguity "rn -> m", would be located in the |
173 | | // table at index of unicharset.unichar_to_id('r'). |
174 | | // In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in |
175 | | // one_to_one_definite_ambigs_. This vector is also indexed by the class id |
176 | | // of the wrong part of the ambiguity and each entry contains a vector of |
177 | | // unichar ids that are ambiguous to it. |
178 | | // encoder_set is used to encode the ambiguity strings, undisturbed by new |
179 | | // unichar_ids that may be created by adding the ambigs. |
180 | | void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, |
181 | | bool use_ambigs_for_adaption, UNICHARSET *unicharset); |
182 | | |
183 | | // Returns definite 1-1 ambigs for the given unichar id. |
184 | 0 | inline const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const { |
185 | 0 | if (one_to_one_definite_ambigs_.empty()) { |
186 | 0 | return nullptr; |
187 | 0 | } |
188 | 0 | return one_to_one_definite_ambigs_[unichar_id]; |
189 | 0 | } |
190 | | |
191 | | // Returns a pointer to the vector with all unichar ids that appear in the |
192 | | // 'correct' part of the ambiguity pair when the given unichar id appears |
193 | | // in the 'wrong' part of the ambiguity. E.g. if DangAmbigs file consist of |
194 | | // m->rn,rn->m,m->iii, UnicharAmbigsForAdaption() called with unichar id of |
195 | | // m will return a pointer to a vector with unichar ids of r,n,i. |
196 | 0 | inline const UnicharIdVector *AmbigsForAdaption(UNICHAR_ID unichar_id) const { |
197 | 0 | if (ambigs_for_adaption_.empty()) { |
198 | 0 | return nullptr; |
199 | 0 | } |
200 | 0 | return ambigs_for_adaption_[unichar_id]; |
201 | 0 | } |
202 | | |
203 | | // Similar to the above, but return the vector of unichar ids for which |
204 | | // the given unichar_id is an ambiguity (appears in the 'wrong' part of |
205 | | // some ambiguity pair). |
206 | 97 | inline const UnicharIdVector *ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const { |
207 | 97 | if (reverse_ambigs_for_adaption_.empty()) { |
208 | 97 | return nullptr; |
209 | 97 | } |
210 | 0 | return reverse_ambigs_for_adaption_[unichar_id]; |
211 | 97 | } |
212 | | |
213 | | private: |
214 | | bool ParseAmbiguityLine(int line_num, int version, int debug_level, const UNICHARSET &unicharset, |
215 | | char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids, |
216 | | int *replacement_ambig_part_size, char *replacement_string, int *type); |
217 | | bool InsertIntoTable(UnicharAmbigsVector &table, int test_ambig_part_size, |
218 | | UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size, |
219 | | const char *replacement_string, int type, AmbigSpec *ambig_spec, |
220 | | UNICHARSET *unicharset); |
221 | | |
222 | | UnicharAmbigsVector dang_ambigs_; |
223 | | UnicharAmbigsVector replace_ambigs_; |
224 | | std::vector<UnicharIdVector *> one_to_one_definite_ambigs_; |
225 | | std::vector<UnicharIdVector *> ambigs_for_adaption_; |
226 | | std::vector<UnicharIdVector *> reverse_ambigs_for_adaption_; |
227 | | }; |
228 | | |
229 | | } // namespace tesseract |
230 | | |
231 | | #endif // !defined(DISABLED_LEGACY_ENGINE) |
232 | | |
233 | | #endif // TESSERACT_CCUTIL_AMBIGS_H_ |