/src/tesseract/src/classify/normmatch.cpp
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | ** Filename: normmatch.c |
3 | | ** Purpose: Simple matcher based on character normalization features. |
4 | | ** Author: Dan Johnson |
5 | | ** |
6 | | ** (c) Copyright Hewlett-Packard Company, 1988. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | ******************************************************************************/ |
17 | | /*---------------------------------------------------------------------------- |
18 | | Include Files and Type Defines |
19 | | ----------------------------------------------------------------------------*/ |
#include "normmatch.h"

#include "classify.h"
#include "clusttool.h"
#include "helpers.h"
#include "normfeat.h"
#include "params.h"
#include "unicharset.h"

#include <cmath>
#include <cstdio>
#include <sstream> // for std::istringstream
#include <string>  // for std::string
32 | | |
33 | | namespace tesseract { |
34 | | |
35 | | struct NORM_PROTOS { |
36 | 4 | NORM_PROTOS(size_t n) : NumProtos(n), Protos(n) { |
37 | 4 | } |
38 | | int NumParams = 0; |
39 | | int NumProtos; |
40 | | PARAM_DESC *ParamDesc = nullptr; |
41 | | std::vector<LIST> Protos; |
42 | | }; |
43 | | |
44 | | /*---------------------------------------------------------------------------- |
45 | | Private Code |
46 | | ----------------------------------------------------------------------------*/ |
47 | | |
48 | | /** |
49 | | * @name NormEvidenceOf |
50 | | * |
51 | | * Return the new type of evidence number corresponding to this |
52 | | * normalization adjustment. The equation that represents the transform is: |
53 | | * 1 / (1 + (NormAdj / midpoint) ^ curl) |
54 | | */ |
55 | 205M | static float NormEvidenceOf(float NormAdj) { |
56 | 205M | NormAdj /= static_cast<float>(classify_norm_adj_midpoint); |
57 | | |
58 | 205M | if (classify_norm_adj_curl == 3) { |
59 | 0 | NormAdj = NormAdj * NormAdj * NormAdj; |
60 | 205M | } else if (classify_norm_adj_curl == 2) { |
61 | 205M | NormAdj = NormAdj * NormAdj; |
62 | 205M | } else { |
63 | 0 | NormAdj = std::pow(NormAdj, static_cast<float>(classify_norm_adj_curl)); |
64 | 0 | } |
65 | 205M | return (1 / (1 + NormAdj)); |
66 | 205M | } |
67 | | |
68 | | /*---------------------------------------------------------------------------- |
69 | | Variables |
70 | | ----------------------------------------------------------------------------*/ |
71 | | |
72 | | /** control knobs used to control the normalization adjustment process */ |
73 | | double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ..."); |
74 | | double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ..."); |
75 | | /** Weight of width variance against height and vertical position. */ |
76 | | const float kWidthErrorWeighting = 0.125f; |
77 | | |
78 | | /*---------------------------------------------------------------------------- |
79 | | Public Code |
80 | | ----------------------------------------------------------------------------*/ |
81 | | /** |
82 | | * This routine compares Features against each character |
83 | | * normalization proto for ClassId and returns the match |
84 | | * rating of the best match. |
85 | | * @param ClassId id of class to match against |
86 | | * @param feature character normalization feature |
87 | | * @param DebugMatch controls dump of debug info |
88 | | * |
89 | | * Globals: |
90 | | * #NormProtos character normalization prototypes |
91 | | * |
92 | | * @return Best match rating for Feature against protos of ClassId. |
93 | | */ |
94 | 209M | float Classify::ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch) { |
95 | 209M | if (ClassId >= NormProtos->NumProtos) { |
96 | 0 | ClassId = NO_CLASS; |
97 | 0 | } |
98 | | |
99 | | /* handle requests for classification as noise */ |
100 | 209M | if (ClassId == NO_CLASS) { |
101 | | /* kludge - clean up constants and make into control knobs later */ |
102 | 1.85M | float Match = (feature.Params[CharNormLength] * feature.Params[CharNormLength] * 500.0f + |
103 | 1.85M | feature.Params[CharNormRx] * feature.Params[CharNormRx] * 8000.0f + |
104 | 1.85M | feature.Params[CharNormRy] * feature.Params[CharNormRy] * 8000.0f); |
105 | 1.85M | return (1 - NormEvidenceOf(Match)); |
106 | 1.85M | } |
107 | | |
108 | 207M | if (DebugMatch) { |
109 | 0 | tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId)); |
110 | 0 | } |
111 | | |
112 | 207M | LIST Protos = NormProtos->Protos[ClassId]; |
113 | 207M | if (Protos == nullptr) { |
114 | | // Avoid FP overflow in NormEvidenceOf. |
115 | 3.70M | return 1.0f; |
116 | 3.70M | } |
117 | | |
118 | 203M | float BestMatch = FLT_MAX; |
119 | 203M | iterate(Protos) { |
120 | 203M | auto Proto = reinterpret_cast<PROTOTYPE *>(Protos->first_node()); |
121 | 203M | float Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY]; |
122 | 203M | float Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY]; |
123 | 203M | if (DebugMatch) { |
124 | 0 | tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormY], Delta, |
125 | 0 | Proto->Weight.Elliptical[CharNormY], Match); |
126 | 0 | } |
127 | 203M | Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx]; |
128 | 203M | Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx]; |
129 | 203M | if (DebugMatch) { |
130 | 0 | tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormRx], Delta, |
131 | 0 | Proto->Weight.Elliptical[CharNormRx], Match); |
132 | 0 | } |
133 | | // Ry is width! See intfx.cpp. |
134 | 203M | Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy]; |
135 | 203M | if (DebugMatch) { |
136 | 0 | tprintf("Width: Proto=%g, Delta=%g, Var=%g\n", Proto->Mean[CharNormRy], Delta, |
137 | 0 | Proto->Weight.Elliptical[CharNormRy]); |
138 | 0 | } |
139 | 203M | Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy]; |
140 | 203M | Delta *= kWidthErrorWeighting; |
141 | 203M | Match += Delta; |
142 | 203M | if (DebugMatch) { |
143 | 0 | tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n", Match, |
144 | 0 | Match / classify_norm_adj_midpoint, NormEvidenceOf(Match), |
145 | 0 | 256 * (1 - NormEvidenceOf(Match))); |
146 | 0 | } |
147 | | |
148 | 203M | if (Match < BestMatch) { |
149 | 203M | BestMatch = Match; |
150 | 203M | } |
151 | 203M | } |
152 | 203M | return 1 - NormEvidenceOf(BestMatch); |
153 | 207M | } /* ComputeNormMatch */ |
154 | | |
155 | 0 | void Classify::FreeNormProtos() { |
156 | 0 | if (NormProtos != nullptr) { |
157 | 0 | for (int i = 0; i < NormProtos->NumProtos; i++) { |
158 | 0 | FreeProtoList(&NormProtos->Protos[i]); |
159 | 0 | } |
160 | 0 | delete[] NormProtos->ParamDesc; |
161 | 0 | delete NormProtos; |
162 | 0 | NormProtos = nullptr; |
163 | 0 | } |
164 | 0 | } |
165 | | |
166 | | /** |
167 | | * This routine allocates a new data structure to hold |
168 | | * a set of character normalization protos. It then fills in |
169 | | * the data structure by reading from the specified File. |
170 | | * @param fp open text file to read normalization protos from |
171 | | * Globals: none |
172 | | * @return Character normalization protos. |
173 | | */ |
174 | 4 | NORM_PROTOS *Classify::ReadNormProtos(TFile *fp) { |
175 | 4 | char unichar[2 * UNICHAR_LEN + 1]; |
176 | 4 | UNICHAR_ID unichar_id; |
177 | 4 | LIST Protos; |
178 | 4 | int NumProtos; |
179 | | |
180 | | /* allocate and initialization data structure */ |
181 | 4 | auto NormProtos = new NORM_PROTOS(unicharset.size()); |
182 | | |
183 | | /* read file header and save in data structure */ |
184 | 4 | NormProtos->NumParams = ReadSampleSize(fp); |
185 | 4 | NormProtos->ParamDesc = ReadParamDesc(fp, NormProtos->NumParams); |
186 | | |
187 | | /* read protos for each class into a separate list */ |
188 | 4 | const int kMaxLineSize = 100; |
189 | 4 | char line[kMaxLineSize]; |
190 | 884 | while (fp->FGets(line, kMaxLineSize) != nullptr) { |
191 | 880 | std::istringstream stream(line); |
192 | 880 | stream.imbue(std::locale::classic()); |
193 | 880 | stream >> unichar >> NumProtos; |
194 | 880 | if (stream.fail()) { |
195 | 440 | continue; |
196 | 440 | } |
197 | 440 | if (unicharset.contains_unichar(unichar)) { |
198 | 440 | unichar_id = unicharset.unichar_to_id(unichar); |
199 | 440 | Protos = NormProtos->Protos[unichar_id]; |
200 | 880 | for (int i = 0; i < NumProtos; i++) { |
201 | 440 | Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams)); |
202 | 440 | } |
203 | 440 | NormProtos->Protos[unichar_id] = Protos; |
204 | 440 | } else { |
205 | 0 | tprintf("Error: unichar %s in normproto file is not in unichar set.\n", unichar); |
206 | 0 | for (int i = 0; i < NumProtos; i++) { |
207 | 0 | FreePrototype(ReadPrototype(fp, NormProtos->NumParams)); |
208 | 0 | } |
209 | 0 | } |
210 | 440 | } |
211 | 4 | return NormProtos; |
212 | 4 | } /* ReadNormProtos */ |
213 | | |
214 | | } // namespace tesseract |