/src/tesseract/src/classify/clusttool.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | ** Filename: clusttool.cpp |
3 | | ** Purpose: Misc. tools for use with the clustering routines |
4 | | ** Author: Dan Johnson |
5 | | ** |
6 | | ** (c) Copyright Hewlett-Packard Company, 1988. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | *****************************************************************************/ |
17 | | |
18 | | #define _USE_MATH_DEFINES // for M_PI |
19 | | |
20 | | #include "clusttool.h" |
21 | | |
22 | | #include <cmath> // for M_PI, std::isnan |
23 | | #include <locale> // for std::locale::classic |
24 | | #include <sstream> // for std::stringstream |
25 | | |
26 | | namespace tesseract { |
27 | | |
28 | | //---------------Global Data Definitions and Declarations-------------------- |
#define TOKENSIZE 80 ///< max size of tokens read from an input file
/// TOKENSIZE - 1 written as a string literal. It is spliced into sscanf format
/// strings as the "%s" field width so that a token plus its terminating NUL
/// always fits into a char[TOKENSIZE] buffer. It cannot be derived from
/// TOKENSIZE automatically here, so keep the two values in sync by hand.
#define QUOTED_TOKENSIZE "79"
#define MAXSAMPLESIZE 65535 ///< max num of dimensions in feature space
32 | | |
33 | | /** |
34 | | * This routine reads N floats from the specified text file |
35 | | * and places them into Buffer. If Buffer is nullptr, a buffer |
36 | | * is created and passed back to the caller. If EOF is |
37 | | * encountered before any floats can be read, nullptr is |
38 | | * returned. |
39 | | * @param fp open text file to read floats from |
40 | | * @param N number of floats to read |
41 | | * @param Buffer pointer to buffer to place floats into |
42 | | * @return Pointer to buffer holding floats or nullptr if EOF |
43 | | * @note Globals: None |
44 | | */ |
45 | 880 | static bool ReadNFloats(TFile *fp, uint16_t N, float Buffer[]) { |
46 | 880 | const int kMaxLineSize = 1024; |
47 | 880 | char line[kMaxLineSize]; |
48 | 880 | if (fp->FGets(line, kMaxLineSize) == nullptr) { |
49 | 0 | tprintf("Hit EOF in ReadNFloats!\n"); |
50 | 0 | return false; |
51 | 0 | } |
52 | | |
53 | 880 | std::stringstream stream(line); |
54 | | // Use "C" locale (needed for float values Buffer[i]). |
55 | 880 | stream.imbue(std::locale::classic()); |
56 | 4.40k | for (uint16_t i = 0; i < N; i++) { |
57 | 3.52k | float f = NAN; |
58 | 3.52k | stream >> f; |
59 | 3.52k | if (std::isnan(f)) { |
60 | 0 | tprintf("Read of %u floats failed!\n", N); |
61 | 0 | return false; |
62 | 0 | } |
63 | 3.52k | Buffer[i] = f; |
64 | 3.52k | } |
65 | 880 | return true; |
66 | 880 | } |
67 | | |
/**
 * Emits N floats from Array to File as text, each formatted with
 * a leading blank and a width of 9 with 6 decimals. All values go
 * on a single line which is terminated by a newline.
 * @param File open text file to write N floats to
 * @param N number of floats to write
 * @param Array array of floats to write
 */
static void WriteNFloats(FILE *File, uint16_t N, float Array[]) {
  const float *const stop = Array + N;
  for (const float *value = Array; value != stop; ++value) {
    fprintf(File, " %9.6f", *value);
  }
  fprintf(File, "\n");
}
81 | | |
82 | | /** |
83 | | * This routine writes to the specified text file a word |
84 | | * which represents the ProtoStyle. It does not append |
85 | | * a carriage return to the end. |
86 | | * @param File open text file to write prototype style to |
87 | | * @param ProtoStyle prototype style to write |
88 | | */ |
89 | 0 | static void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle) { |
90 | 0 | switch (ProtoStyle) { |
91 | 0 | case spherical: |
92 | 0 | fprintf(File, "spherical"); |
93 | 0 | break; |
94 | 0 | case elliptical: |
95 | 0 | fprintf(File, "elliptical"); |
96 | 0 | break; |
97 | 0 | case mixed: |
98 | 0 | fprintf(File, "mixed"); |
99 | 0 | break; |
100 | 0 | case automatic: |
101 | 0 | fprintf(File, "automatic"); |
102 | 0 | break; |
103 | 0 | } |
104 | 0 | } |
105 | | |
106 | | /** |
107 | | * This routine reads a single integer from the specified |
108 | | * file and checks to ensure that it is between 0 and |
109 | | * MAXSAMPLESIZE. |
110 | | * @param fp open text file to read sample size from |
111 | | * @return Sample size |
112 | | * @note Globals: None |
113 | | */ |
114 | 4 | uint16_t ReadSampleSize(TFile *fp) { |
115 | 4 | int SampleSize = 0; |
116 | | |
117 | 4 | const int kMaxLineSize = 100; |
118 | 4 | char line[kMaxLineSize]; |
119 | 4 | ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr); |
120 | 4 | ASSERT_HOST(sscanf(line, "%d", &SampleSize) == 1); |
121 | 4 | ASSERT_HOST(SampleSize >= 0 && SampleSize <= MAXSAMPLESIZE); |
122 | 4 | return SampleSize; |
123 | 4 | } |
124 | | |
125 | | /** |
126 | | * This routine reads textual descriptions of sets of parameters |
127 | | * which describe the characteristics of feature dimensions. |
128 | | * |
129 | | * @param fp open text file to read N parameter descriptions from |
130 | | * @param N number of parameter descriptions to read |
131 | | * @return Pointer to an array of parameter descriptors. |
132 | | * @note Globals: None |
133 | | */ |
134 | 4 | PARAM_DESC *ReadParamDesc(TFile *fp, uint16_t N) { |
135 | 4 | auto ParamDesc = new PARAM_DESC[N]; |
136 | 20 | for (int i = 0; i < N; i++) { |
137 | 16 | const int kMaxLineSize = TOKENSIZE * 4; |
138 | 16 | char line[kMaxLineSize]; |
139 | 16 | ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr); |
140 | 16 | std::istringstream stream(line); |
141 | | // Use "C" locale (needed for float values Min, Max). |
142 | 16 | stream.imbue(std::locale::classic()); |
143 | 16 | std::string linear_token; |
144 | 16 | stream >> linear_token; |
145 | 16 | std::string essential_token; |
146 | 16 | stream >> essential_token; |
147 | 16 | stream >> ParamDesc[i].Min; |
148 | 16 | stream >> ParamDesc[i].Max; |
149 | 16 | ASSERT_HOST(!stream.fail()); |
150 | 16 | ParamDesc[i].Circular = (linear_token[0] == 'c'); |
151 | 16 | ParamDesc[i].NonEssential = (essential_token[0] != 'e'); |
152 | 16 | ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min; |
153 | 16 | ParamDesc[i].HalfRange = ParamDesc[i].Range / 2; |
154 | 16 | ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2; |
155 | 16 | } |
156 | 4 | return (ParamDesc); |
157 | 4 | } |
158 | | |
159 | | /** |
160 | | * This routine reads a textual description of a prototype from |
161 | | * the specified file. |
162 | | * |
163 | | * @param fp open text file to read prototype from |
164 | | * @param N number of dimensions used in prototype |
165 | | * @return List of prototypes |
166 | | * @note Globals: None |
167 | | */ |
168 | 440 | PROTOTYPE *ReadPrototype(TFile *fp, uint16_t N) { |
169 | 440 | char sig_token[TOKENSIZE], shape_token[TOKENSIZE]; |
170 | 440 | int SampleCount; |
171 | 440 | int i; |
172 | | |
173 | 440 | const int kMaxLineSize = TOKENSIZE * 4; |
174 | 440 | char line[kMaxLineSize]; |
175 | 440 | if (fp->FGets(line, kMaxLineSize) == nullptr || |
176 | 440 | sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %d", sig_token, shape_token, |
177 | 440 | &SampleCount) != 3) { |
178 | 0 | tprintf("Invalid prototype: %s\n", line); |
179 | 0 | return nullptr; |
180 | 0 | } |
181 | 440 | auto Proto = new PROTOTYPE; |
182 | 440 | Proto->Cluster = nullptr; |
183 | 440 | Proto->Significant = (sig_token[0] == 's'); |
184 | | |
185 | 440 | switch (shape_token[0]) { |
186 | 0 | case 's': |
187 | 0 | Proto->Style = spherical; |
188 | 0 | break; |
189 | 440 | case 'e': |
190 | 440 | Proto->Style = elliptical; |
191 | 440 | break; |
192 | 0 | case 'a': |
193 | 0 | Proto->Style = automatic; |
194 | 0 | break; |
195 | 0 | default: |
196 | 0 | tprintf("Invalid prototype style specification:%s\n", shape_token); |
197 | 0 | Proto->Style = elliptical; |
198 | 440 | } |
199 | | |
200 | 440 | ASSERT_HOST(SampleCount >= 0); |
201 | 440 | Proto->NumSamples = SampleCount; |
202 | | |
203 | 440 | Proto->Mean.resize(N); |
204 | 440 | ReadNFloats(fp, N, &Proto->Mean[0]); |
205 | | |
206 | 440 | switch (Proto->Style) { |
207 | 0 | case spherical: |
208 | 0 | ReadNFloats(fp, 1, &(Proto->Variance.Spherical)); |
209 | 0 | Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical); |
210 | 0 | Proto->TotalMagnitude = std::pow(Proto->Magnitude.Spherical, static_cast<float>(N)); |
211 | 0 | Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude)); |
212 | 0 | Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical; |
213 | 0 | Proto->Distrib.clear(); |
214 | 0 | break; |
215 | 440 | case elliptical: |
216 | 440 | Proto->Variance.Elliptical = new float[N]; |
217 | 440 | ReadNFloats(fp, N, Proto->Variance.Elliptical); |
218 | 440 | Proto->Magnitude.Elliptical = new float[N]; |
219 | 440 | Proto->Weight.Elliptical = new float[N]; |
220 | 440 | Proto->TotalMagnitude = 1.0; |
221 | 2.20k | for (i = 0; i < N; i++) { |
222 | 1.76k | Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]); |
223 | 1.76k | Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i]; |
224 | 1.76k | Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; |
225 | 1.76k | } |
226 | 440 | Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude)); |
227 | 440 | Proto->Distrib.clear(); |
228 | 440 | break; |
229 | 0 | default: |
230 | 0 | delete Proto; |
231 | 0 | tprintf("Invalid prototype style\n"); |
232 | 0 | return nullptr; |
233 | 440 | } |
234 | 440 | return Proto; |
235 | 440 | } |
236 | | |
237 | | /** |
238 | | * This routine writes an array of dimension descriptors to |
239 | | * the specified text file. |
240 | | * @param File open text file to write param descriptors to |
241 | | * @param N number of param descriptors to write |
242 | | * @param ParamDesc array of param descriptors to write |
243 | | */ |
244 | 0 | void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[]) { |
245 | 0 | int i; |
246 | |
|
247 | 0 | for (i = 0; i < N; i++) { |
248 | 0 | if (ParamDesc[i].Circular) { |
249 | 0 | fprintf(File, "circular "); |
250 | 0 | } else { |
251 | 0 | fprintf(File, "linear "); |
252 | 0 | } |
253 | |
|
254 | 0 | if (ParamDesc[i].NonEssential) { |
255 | 0 | fprintf(File, "non-essential "); |
256 | 0 | } else { |
257 | 0 | fprintf(File, "essential "); |
258 | 0 | } |
259 | |
|
260 | 0 | fprintf(File, "%10.6f %10.6f\n", ParamDesc[i].Min, ParamDesc[i].Max); |
261 | 0 | } |
262 | 0 | } |
263 | | |
264 | | /** |
265 | | * This routine writes a textual description of a prototype |
266 | | * to the specified text file. |
267 | | * @param File open text file to write prototype to |
268 | | * @param N number of dimensions in feature space |
269 | | * @param Proto prototype to write out |
270 | | */ |
271 | 0 | void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto) { |
272 | 0 | int i; |
273 | |
|
274 | 0 | if (Proto->Significant) { |
275 | 0 | fprintf(File, "significant "); |
276 | 0 | } else { |
277 | 0 | fprintf(File, "insignificant "); |
278 | 0 | } |
279 | 0 | WriteProtoStyle(File, static_cast<PROTOSTYLE>(Proto->Style)); |
280 | 0 | fprintf(File, "%6d\n\t", Proto->NumSamples); |
281 | 0 | WriteNFloats(File, N, &Proto->Mean[0]); |
282 | 0 | fprintf(File, "\t"); |
283 | |
|
284 | 0 | switch (Proto->Style) { |
285 | 0 | case spherical: |
286 | 0 | WriteNFloats(File, 1, &(Proto->Variance.Spherical)); |
287 | 0 | break; |
288 | 0 | case elliptical: |
289 | 0 | WriteNFloats(File, N, Proto->Variance.Elliptical); |
290 | 0 | break; |
291 | 0 | case mixed: |
292 | 0 | for (i = 0; i < N; i++) { |
293 | 0 | switch (Proto->Distrib[i]) { |
294 | 0 | case normal: |
295 | 0 | fprintf(File, " %9s", "normal"); |
296 | 0 | break; |
297 | 0 | case uniform: |
298 | 0 | fprintf(File, " %9s", "uniform"); |
299 | 0 | break; |
300 | 0 | case D_random: |
301 | 0 | fprintf(File, " %9s", "random"); |
302 | 0 | break; |
303 | 0 | case DISTRIBUTION_COUNT: |
304 | 0 | ASSERT_HOST(!"Distribution count not allowed!"); |
305 | 0 | } |
306 | 0 | } |
307 | 0 | fprintf(File, "\n\t"); |
308 | 0 | WriteNFloats(File, N, Proto->Variance.Elliptical); |
309 | 0 | } |
310 | 0 | } |
311 | | |
312 | | } // namespace tesseract |