/src/tesseract/src/classify/cluster.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | ** Filename: cluster.cpp |
3 | | ** Purpose: Routines for clustering points in N-D space |
4 | | ** Author: Dan Johnson |
5 | | ** |
6 | | ** (c) Copyright Hewlett-Packard Company, 1988. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | *****************************************************************************/ |
17 | | |
18 | | #define _USE_MATH_DEFINES // for M_PI |
19 | | |
20 | | #include "cluster.h" |
21 | | |
22 | | #include "genericheap.h" |
23 | | #include "kdpair.h" |
24 | | #include "matrix.h" |
25 | | #include "tprintf.h" |
26 | | |
27 | | #include "helpers.h" |
28 | | |
29 | | #include <cfloat> // for FLT_MAX |
30 | | #include <cmath> // for M_PI |
31 | | #include <vector> // for std::vector |
32 | | |
33 | | namespace tesseract { |
34 | | |
35 | 0 | #define HOTELLING 1 // If true use Hotelling's test to decide where to split. |
36 | 0 | #define FTABLE_X 10 // Size of FTable. |
37 | 0 | #define FTABLE_Y 100 // Size of FTable. |
38 | | |
39 | | // Table of values approximating the cumulative F-distribution for a confidence |
40 | | // of 1%. |
41 | | const double FTable[FTABLE_Y][FTABLE_X] = { |
42 | | { |
43 | | 4052.19, |
44 | | 4999.52, |
45 | | 5403.34, |
46 | | 5624.62, |
47 | | 5763.65, |
48 | | 5858.97, |
49 | | 5928.33, |
50 | | 5981.10, |
51 | | 6022.50, |
52 | | 6055.85, |
53 | | }, |
54 | | { |
55 | | 98.502, |
56 | | 99.000, |
57 | | 99.166, |
58 | | 99.249, |
59 | | 99.300, |
60 | | 99.333, |
61 | | 99.356, |
62 | | 99.374, |
63 | | 99.388, |
64 | | 99.399, |
65 | | }, |
66 | | { |
67 | | 34.116, |
68 | | 30.816, |
69 | | 29.457, |
70 | | 28.710, |
71 | | 28.237, |
72 | | 27.911, |
73 | | 27.672, |
74 | | 27.489, |
75 | | 27.345, |
76 | | 27.229, |
77 | | }, |
78 | | { |
79 | | 21.198, |
80 | | 18.000, |
81 | | 16.694, |
82 | | 15.977, |
83 | | 15.522, |
84 | | 15.207, |
85 | | 14.976, |
86 | | 14.799, |
87 | | 14.659, |
88 | | 14.546, |
89 | | }, |
90 | | { |
91 | | 16.258, |
92 | | 13.274, |
93 | | 12.060, |
94 | | 11.392, |
95 | | 10.967, |
96 | | 10.672, |
97 | | 10.456, |
98 | | 10.289, |
99 | | 10.158, |
100 | | 10.051, |
101 | | }, |
102 | | { |
103 | | 13.745, |
104 | | 10.925, |
105 | | 9.780, |
106 | | 9.148, |
107 | | 8.746, |
108 | | 8.466, |
109 | | 8.260, |
110 | | 8.102, |
111 | | 7.976, |
112 | | 7.874, |
113 | | }, |
114 | | { |
115 | | 12.246, |
116 | | 9.547, |
117 | | 8.451, |
118 | | 7.847, |
119 | | 7.460, |
120 | | 7.191, |
121 | | 6.993, |
122 | | 6.840, |
123 | | 6.719, |
124 | | 6.620, |
125 | | }, |
126 | | { |
127 | | 11.259, |
128 | | 8.649, |
129 | | 7.591, |
130 | | 7.006, |
131 | | 6.632, |
132 | | 6.371, |
133 | | 6.178, |
134 | | 6.029, |
135 | | 5.911, |
136 | | 5.814, |
137 | | }, |
138 | | { |
139 | | 10.561, |
140 | | 8.022, |
141 | | 6.992, |
142 | | 6.422, |
143 | | 6.057, |
144 | | 5.802, |
145 | | 5.613, |
146 | | 5.467, |
147 | | 5.351, |
148 | | 5.257, |
149 | | }, |
150 | | { |
151 | | 10.044, |
152 | | 7.559, |
153 | | 6.552, |
154 | | 5.994, |
155 | | 5.636, |
156 | | 5.386, |
157 | | 5.200, |
158 | | 5.057, |
159 | | 4.942, |
160 | | 4.849, |
161 | | }, |
162 | | { |
163 | | 9.646, |
164 | | 7.206, |
165 | | 6.217, |
166 | | 5.668, |
167 | | 5.316, |
168 | | 5.069, |
169 | | 4.886, |
170 | | 4.744, |
171 | | 4.632, |
172 | | 4.539, |
173 | | }, |
174 | | { |
175 | | 9.330, |
176 | | 6.927, |
177 | | 5.953, |
178 | | 5.412, |
179 | | 5.064, |
180 | | 4.821, |
181 | | 4.640, |
182 | | 4.499, |
183 | | 4.388, |
184 | | 4.296, |
185 | | }, |
186 | | { |
187 | | 9.074, |
188 | | 6.701, |
189 | | 5.739, |
190 | | 5.205, |
191 | | 4.862, |
192 | | 4.620, |
193 | | 4.441, |
194 | | 4.302, |
195 | | 4.191, |
196 | | 4.100, |
197 | | }, |
198 | | { |
199 | | 8.862, |
200 | | 6.515, |
201 | | 5.564, |
202 | | 5.035, |
203 | | 4.695, |
204 | | 4.456, |
205 | | 4.278, |
206 | | 4.140, |
207 | | 4.030, |
208 | | 3.939, |
209 | | }, |
210 | | { |
211 | | 8.683, |
212 | | 6.359, |
213 | | 5.417, |
214 | | 4.893, |
215 | | 4.556, |
216 | | 4.318, |
217 | | 4.142, |
218 | | 4.004, |
219 | | 3.895, |
220 | | 3.805, |
221 | | }, |
222 | | { |
223 | | 8.531, |
224 | | 6.226, |
225 | | 5.292, |
226 | | 4.773, |
227 | | 4.437, |
228 | | 4.202, |
229 | | 4.026, |
230 | | 3.890, |
231 | | 3.780, |
232 | | 3.691, |
233 | | }, |
234 | | { |
235 | | 8.400, |
236 | | 6.112, |
237 | | 5.185, |
238 | | 4.669, |
239 | | 4.336, |
240 | | 4.102, |
241 | | 3.927, |
242 | | 3.791, |
243 | | 3.682, |
244 | | 3.593, |
245 | | }, |
246 | | { |
247 | | 8.285, |
248 | | 6.013, |
249 | | 5.092, |
250 | | 4.579, |
251 | | 4.248, |
252 | | 4.015, |
253 | | 3.841, |
254 | | 3.705, |
255 | | 3.597, |
256 | | 3.508, |
257 | | }, |
258 | | { |
259 | | 8.185, |
260 | | 5.926, |
261 | | 5.010, |
262 | | 4.500, |
263 | | 4.171, |
264 | | 3.939, |
265 | | 3.765, |
266 | | 3.631, |
267 | | 3.523, |
268 | | 3.434, |
269 | | }, |
270 | | { |
271 | | 8.096, |
272 | | 5.849, |
273 | | 4.938, |
274 | | 4.431, |
275 | | 4.103, |
276 | | 3.871, |
277 | | 3.699, |
278 | | 3.564, |
279 | | 3.457, |
280 | | 3.368, |
281 | | }, |
282 | | { |
283 | | 8.017, |
284 | | 5.780, |
285 | | 4.874, |
286 | | 4.369, |
287 | | 4.042, |
288 | | 3.812, |
289 | | 3.640, |
290 | | 3.506, |
291 | | 3.398, |
292 | | 3.310, |
293 | | }, |
294 | | { |
295 | | 7.945, |
296 | | 5.719, |
297 | | 4.817, |
298 | | 4.313, |
299 | | 3.988, |
300 | | 3.758, |
301 | | 3.587, |
302 | | 3.453, |
303 | | 3.346, |
304 | | 3.258, |
305 | | }, |
306 | | { |
307 | | 7.881, |
308 | | 5.664, |
309 | | 4.765, |
310 | | 4.264, |
311 | | 3.939, |
312 | | 3.710, |
313 | | 3.539, |
314 | | 3.406, |
315 | | 3.299, |
316 | | 3.211, |
317 | | }, |
318 | | { |
319 | | 7.823, |
320 | | 5.614, |
321 | | 4.718, |
322 | | 4.218, |
323 | | 3.895, |
324 | | 3.667, |
325 | | 3.496, |
326 | | 3.363, |
327 | | 3.256, |
328 | | 3.168, |
329 | | }, |
330 | | { |
331 | | 7.770, |
332 | | 5.568, |
333 | | 4.675, |
334 | | 4.177, |
335 | | 3.855, |
336 | | 3.627, |
337 | | 3.457, |
338 | | 3.324, |
339 | | 3.217, |
340 | | 3.129, |
341 | | }, |
342 | | { |
343 | | 7.721, |
344 | | 5.526, |
345 | | 4.637, |
346 | | 4.140, |
347 | | 3.818, |
348 | | 3.591, |
349 | | 3.421, |
350 | | 3.288, |
351 | | 3.182, |
352 | | 3.094, |
353 | | }, |
354 | | { |
355 | | 7.677, |
356 | | 5.488, |
357 | | 4.601, |
358 | | 4.106, |
359 | | 3.785, |
360 | | 3.558, |
361 | | 3.388, |
362 | | 3.256, |
363 | | 3.149, |
364 | | 3.062, |
365 | | }, |
366 | | { |
367 | | 7.636, |
368 | | 5.453, |
369 | | 4.568, |
370 | | 4.074, |
371 | | 3.754, |
372 | | 3.528, |
373 | | 3.358, |
374 | | 3.226, |
375 | | 3.120, |
376 | | 3.032, |
377 | | }, |
378 | | { |
379 | | 7.598, |
380 | | 5.420, |
381 | | 4.538, |
382 | | 4.045, |
383 | | 3.725, |
384 | | 3.499, |
385 | | 3.330, |
386 | | 3.198, |
387 | | 3.092, |
388 | | 3.005, |
389 | | }, |
390 | | { |
391 | | 7.562, |
392 | | 5.390, |
393 | | 4.510, |
394 | | 4.018, |
395 | | 3.699, |
396 | | 3.473, |
397 | | 3.305, |
398 | | 3.173, |
399 | | 3.067, |
400 | | 2.979, |
401 | | }, |
402 | | { |
403 | | 7.530, |
404 | | 5.362, |
405 | | 4.484, |
406 | | 3.993, |
407 | | 3.675, |
408 | | 3.449, |
409 | | 3.281, |
410 | | 3.149, |
411 | | 3.043, |
412 | | 2.955, |
413 | | }, |
414 | | { |
415 | | 7.499, |
416 | | 5.336, |
417 | | 4.459, |
418 | | 3.969, |
419 | | 3.652, |
420 | | 3.427, |
421 | | 3.258, |
422 | | 3.127, |
423 | | 3.021, |
424 | | 2.934, |
425 | | }, |
426 | | { |
427 | | 7.471, |
428 | | 5.312, |
429 | | 4.437, |
430 | | 3.948, |
431 | | 3.630, |
432 | | 3.406, |
433 | | 3.238, |
434 | | 3.106, |
435 | | 3.000, |
436 | | 2.913, |
437 | | }, |
438 | | { |
439 | | 7.444, |
440 | | 5.289, |
441 | | 4.416, |
442 | | 3.927, |
443 | | 3.611, |
444 | | 3.386, |
445 | | 3.218, |
446 | | 3.087, |
447 | | 2.981, |
448 | | 2.894, |
449 | | }, |
450 | | { |
451 | | 7.419, |
452 | | 5.268, |
453 | | 4.396, |
454 | | 3.908, |
455 | | 3.592, |
456 | | 3.368, |
457 | | 3.200, |
458 | | 3.069, |
459 | | 2.963, |
460 | | 2.876, |
461 | | }, |
462 | | { |
463 | | 7.396, |
464 | | 5.248, |
465 | | 4.377, |
466 | | 3.890, |
467 | | 3.574, |
468 | | 3.351, |
469 | | 3.183, |
470 | | 3.052, |
471 | | 2.946, |
472 | | 2.859, |
473 | | }, |
474 | | { |
475 | | 7.373, |
476 | | 5.229, |
477 | | 4.360, |
478 | | 3.873, |
479 | | 3.558, |
480 | | 3.334, |
481 | | 3.167, |
482 | | 3.036, |
483 | | 2.930, |
484 | | 2.843, |
485 | | }, |
486 | | { |
487 | | 7.353, |
488 | | 5.211, |
489 | | 4.343, |
490 | | 3.858, |
491 | | 3.542, |
492 | | 3.319, |
493 | | 3.152, |
494 | | 3.021, |
495 | | 2.915, |
496 | | 2.828, |
497 | | }, |
498 | | { |
499 | | 7.333, |
500 | | 5.194, |
501 | | 4.327, |
502 | | 3.843, |
503 | | 3.528, |
504 | | 3.305, |
505 | | 3.137, |
506 | | 3.006, |
507 | | 2.901, |
508 | | 2.814, |
509 | | }, |
510 | | { |
511 | | 7.314, |
512 | | 5.179, |
513 | | 4.313, |
514 | | 3.828, |
515 | | 3.514, |
516 | | 3.291, |
517 | | 3.124, |
518 | | 2.993, |
519 | | 2.888, |
520 | | 2.801, |
521 | | }, |
522 | | { |
523 | | 7.296, |
524 | | 5.163, |
525 | | 4.299, |
526 | | 3.815, |
527 | | 3.501, |
528 | | 3.278, |
529 | | 3.111, |
530 | | 2.980, |
531 | | 2.875, |
532 | | 2.788, |
533 | | }, |
534 | | { |
535 | | 7.280, |
536 | | 5.149, |
537 | | 4.285, |
538 | | 3.802, |
539 | | 3.488, |
540 | | 3.266, |
541 | | 3.099, |
542 | | 2.968, |
543 | | 2.863, |
544 | | 2.776, |
545 | | }, |
546 | | { |
547 | | 7.264, |
548 | | 5.136, |
549 | | 4.273, |
550 | | 3.790, |
551 | | 3.476, |
552 | | 3.254, |
553 | | 3.087, |
554 | | 2.957, |
555 | | 2.851, |
556 | | 2.764, |
557 | | }, |
558 | | { |
559 | | 7.248, |
560 | | 5.123, |
561 | | 4.261, |
562 | | 3.778, |
563 | | 3.465, |
564 | | 3.243, |
565 | | 3.076, |
566 | | 2.946, |
567 | | 2.840, |
568 | | 2.754, |
569 | | }, |
570 | | { |
571 | | 7.234, |
572 | | 5.110, |
573 | | 4.249, |
574 | | 3.767, |
575 | | 3.454, |
576 | | 3.232, |
577 | | 3.066, |
578 | | 2.935, |
579 | | 2.830, |
580 | | 2.743, |
581 | | }, |
582 | | { |
583 | | 7.220, |
584 | | 5.099, |
585 | | 4.238, |
586 | | 3.757, |
587 | | 3.444, |
588 | | 3.222, |
589 | | 3.056, |
590 | | 2.925, |
591 | | 2.820, |
592 | | 2.733, |
593 | | }, |
594 | | { |
595 | | 7.207, |
596 | | 5.087, |
597 | | 4.228, |
598 | | 3.747, |
599 | | 3.434, |
600 | | 3.213, |
601 | | 3.046, |
602 | | 2.916, |
603 | | 2.811, |
604 | | 2.724, |
605 | | }, |
606 | | { |
607 | | 7.194, |
608 | | 5.077, |
609 | | 4.218, |
610 | | 3.737, |
611 | | 3.425, |
612 | | 3.204, |
613 | | 3.037, |
614 | | 2.907, |
615 | | 2.802, |
616 | | 2.715, |
617 | | }, |
618 | | { |
619 | | 7.182, |
620 | | 5.066, |
621 | | 4.208, |
622 | | 3.728, |
623 | | 3.416, |
624 | | 3.195, |
625 | | 3.028, |
626 | | 2.898, |
627 | | 2.793, |
628 | | 2.706, |
629 | | }, |
630 | | { |
631 | | 7.171, |
632 | | 5.057, |
633 | | 4.199, |
634 | | 3.720, |
635 | | 3.408, |
636 | | 3.186, |
637 | | 3.020, |
638 | | 2.890, |
639 | | 2.785, |
640 | | 2.698, |
641 | | }, |
642 | | { |
643 | | 7.159, |
644 | | 5.047, |
645 | | 4.191, |
646 | | 3.711, |
647 | | 3.400, |
648 | | 3.178, |
649 | | 3.012, |
650 | | 2.882, |
651 | | 2.777, |
652 | | 2.690, |
653 | | }, |
654 | | { |
655 | | 7.149, |
656 | | 5.038, |
657 | | 4.182, |
658 | | 3.703, |
659 | | 3.392, |
660 | | 3.171, |
661 | | 3.005, |
662 | | 2.874, |
663 | | 2.769, |
664 | | 2.683, |
665 | | }, |
666 | | { |
667 | | 7.139, |
668 | | 5.030, |
669 | | 4.174, |
670 | | 3.695, |
671 | | 3.384, |
672 | | 3.163, |
673 | | 2.997, |
674 | | 2.867, |
675 | | 2.762, |
676 | | 2.675, |
677 | | }, |
678 | | { |
679 | | 7.129, |
680 | | 5.021, |
681 | | 4.167, |
682 | | 3.688, |
683 | | 3.377, |
684 | | 3.156, |
685 | | 2.990, |
686 | | 2.860, |
687 | | 2.755, |
688 | | 2.668, |
689 | | }, |
690 | | { |
691 | | 7.119, |
692 | | 5.013, |
693 | | 4.159, |
694 | | 3.681, |
695 | | 3.370, |
696 | | 3.149, |
697 | | 2.983, |
698 | | 2.853, |
699 | | 2.748, |
700 | | 2.662, |
701 | | }, |
702 | | { |
703 | | 7.110, |
704 | | 5.006, |
705 | | 4.152, |
706 | | 3.674, |
707 | | 3.363, |
708 | | 3.143, |
709 | | 2.977, |
710 | | 2.847, |
711 | | 2.742, |
712 | | 2.655, |
713 | | }, |
714 | | { |
715 | | 7.102, |
716 | | 4.998, |
717 | | 4.145, |
718 | | 3.667, |
719 | | 3.357, |
720 | | 3.136, |
721 | | 2.971, |
722 | | 2.841, |
723 | | 2.736, |
724 | | 2.649, |
725 | | }, |
726 | | { |
727 | | 7.093, |
728 | | 4.991, |
729 | | 4.138, |
730 | | 3.661, |
731 | | 3.351, |
732 | | 3.130, |
733 | | 2.965, |
734 | | 2.835, |
735 | | 2.730, |
736 | | 2.643, |
737 | | }, |
738 | | { |
739 | | 7.085, |
740 | | 4.984, |
741 | | 4.132, |
742 | | 3.655, |
743 | | 3.345, |
744 | | 3.124, |
745 | | 2.959, |
746 | | 2.829, |
747 | | 2.724, |
748 | | 2.637, |
749 | | }, |
750 | | { |
751 | | 7.077, |
752 | | 4.977, |
753 | | 4.126, |
754 | | 3.649, |
755 | | 3.339, |
756 | | 3.119, |
757 | | 2.953, |
758 | | 2.823, |
759 | | 2.718, |
760 | | 2.632, |
761 | | }, |
762 | | { |
763 | | 7.070, |
764 | | 4.971, |
765 | | 4.120, |
766 | | 3.643, |
767 | | 3.333, |
768 | | 3.113, |
769 | | 2.948, |
770 | | 2.818, |
771 | | 2.713, |
772 | | 2.626, |
773 | | }, |
774 | | { |
775 | | 7.062, |
776 | | 4.965, |
777 | | 4.114, |
778 | | 3.638, |
779 | | 3.328, |
780 | | 3.108, |
781 | | 2.942, |
782 | | 2.813, |
783 | | 2.708, |
784 | | 2.621, |
785 | | }, |
786 | | { |
787 | | 7.055, |
788 | | 4.959, |
789 | | 4.109, |
790 | | 3.632, |
791 | | 3.323, |
792 | | 3.103, |
793 | | 2.937, |
794 | | 2.808, |
795 | | 2.703, |
796 | | 2.616, |
797 | | }, |
798 | | { |
799 | | 7.048, |
800 | | 4.953, |
801 | | 4.103, |
802 | | 3.627, |
803 | | 3.318, |
804 | | 3.098, |
805 | | 2.932, |
806 | | 2.803, |
807 | | 2.698, |
808 | | 2.611, |
809 | | }, |
810 | | { |
811 | | 7.042, |
812 | | 4.947, |
813 | | 4.098, |
814 | | 3.622, |
815 | | 3.313, |
816 | | 3.093, |
817 | | 2.928, |
818 | | 2.798, |
819 | | 2.693, |
820 | | 2.607, |
821 | | }, |
822 | | { |
823 | | 7.035, |
824 | | 4.942, |
825 | | 4.093, |
826 | | 3.618, |
827 | | 3.308, |
828 | | 3.088, |
829 | | 2.923, |
830 | | 2.793, |
831 | | 2.689, |
832 | | 2.602, |
833 | | }, |
834 | | { |
835 | | 7.029, |
836 | | 4.937, |
837 | | 4.088, |
838 | | 3.613, |
839 | | 3.304, |
840 | | 3.084, |
841 | | 2.919, |
842 | | 2.789, |
843 | | 2.684, |
844 | | 2.598, |
845 | | }, |
846 | | { |
847 | | 7.023, |
848 | | 4.932, |
849 | | 4.083, |
850 | | 3.608, |
851 | | 3.299, |
852 | | 3.080, |
853 | | 2.914, |
854 | | 2.785, |
855 | | 2.680, |
856 | | 2.593, |
857 | | }, |
858 | | { |
859 | | 7.017, |
860 | | 4.927, |
861 | | 4.079, |
862 | | 3.604, |
863 | | 3.295, |
864 | | 3.075, |
865 | | 2.910, |
866 | | 2.781, |
867 | | 2.676, |
868 | | 2.589, |
869 | | }, |
870 | | { |
871 | | 7.011, |
872 | | 4.922, |
873 | | 4.074, |
874 | | 3.600, |
875 | | 3.291, |
876 | | 3.071, |
877 | | 2.906, |
878 | | 2.777, |
879 | | 2.672, |
880 | | 2.585, |
881 | | }, |
882 | | { |
883 | | 7.006, |
884 | | 4.917, |
885 | | 4.070, |
886 | | 3.596, |
887 | | 3.287, |
888 | | 3.067, |
889 | | 2.902, |
890 | | 2.773, |
891 | | 2.668, |
892 | | 2.581, |
893 | | }, |
894 | | { |
895 | | 7.001, |
896 | | 4.913, |
897 | | 4.066, |
898 | | 3.591, |
899 | | 3.283, |
900 | | 3.063, |
901 | | 2.898, |
902 | | 2.769, |
903 | | 2.664, |
904 | | 2.578, |
905 | | }, |
906 | | { |
907 | | 6.995, |
908 | | 4.908, |
909 | | 4.062, |
910 | | 3.588, |
911 | | 3.279, |
912 | | 3.060, |
913 | | 2.895, |
914 | | 2.765, |
915 | | 2.660, |
916 | | 2.574, |
917 | | }, |
918 | | { |
919 | | 6.990, |
920 | | 4.904, |
921 | | 4.058, |
922 | | 3.584, |
923 | | 3.275, |
924 | | 3.056, |
925 | | 2.891, |
926 | | 2.762, |
927 | | 2.657, |
928 | | 2.570, |
929 | | }, |
930 | | { |
931 | | 6.985, |
932 | | 4.900, |
933 | | 4.054, |
934 | | 3.580, |
935 | | 3.272, |
936 | | 3.052, |
937 | | 2.887, |
938 | | 2.758, |
939 | | 2.653, |
940 | | 2.567, |
941 | | }, |
942 | | { |
943 | | 6.981, |
944 | | 4.896, |
945 | | 4.050, |
946 | | 3.577, |
947 | | 3.268, |
948 | | 3.049, |
949 | | 2.884, |
950 | | 2.755, |
951 | | 2.650, |
952 | | 2.563, |
953 | | }, |
954 | | { |
955 | | 6.976, |
956 | | 4.892, |
957 | | 4.047, |
958 | | 3.573, |
959 | | 3.265, |
960 | | 3.046, |
961 | | 2.881, |
962 | | 2.751, |
963 | | 2.647, |
964 | | 2.560, |
965 | | }, |
966 | | { |
967 | | 6.971, |
968 | | 4.888, |
969 | | 4.043, |
970 | | 3.570, |
971 | | 3.261, |
972 | | 3.042, |
973 | | 2.877, |
974 | | 2.748, |
975 | | 2.644, |
976 | | 2.557, |
977 | | }, |
978 | | { |
979 | | 6.967, |
980 | | 4.884, |
981 | | 4.040, |
982 | | 3.566, |
983 | | 3.258, |
984 | | 3.039, |
985 | | 2.874, |
986 | | 2.745, |
987 | | 2.640, |
988 | | 2.554, |
989 | | }, |
990 | | { |
991 | | 6.963, |
992 | | 4.881, |
993 | | 4.036, |
994 | | 3.563, |
995 | | 3.255, |
996 | | 3.036, |
997 | | 2.871, |
998 | | 2.742, |
999 | | 2.637, |
1000 | | 2.551, |
1001 | | }, |
1002 | | { |
1003 | | 6.958, |
1004 | | 4.877, |
1005 | | 4.033, |
1006 | | 3.560, |
1007 | | 3.252, |
1008 | | 3.033, |
1009 | | 2.868, |
1010 | | 2.739, |
1011 | | 2.634, |
1012 | | 2.548, |
1013 | | }, |
1014 | | { |
1015 | | 6.954, |
1016 | | 4.874, |
1017 | | 4.030, |
1018 | | 3.557, |
1019 | | 3.249, |
1020 | | 3.030, |
1021 | | 2.865, |
1022 | | 2.736, |
1023 | | 2.632, |
1024 | | 2.545, |
1025 | | }, |
1026 | | { |
1027 | | 6.950, |
1028 | | 4.870, |
1029 | | 4.027, |
1030 | | 3.554, |
1031 | | 3.246, |
1032 | | 3.027, |
1033 | | 2.863, |
1034 | | 2.733, |
1035 | | 2.629, |
1036 | | 2.542, |
1037 | | }, |
1038 | | { |
1039 | | 6.947, |
1040 | | 4.867, |
1041 | | 4.024, |
1042 | | 3.551, |
1043 | | 3.243, |
1044 | | 3.025, |
1045 | | 2.860, |
1046 | | 2.731, |
1047 | | 2.626, |
1048 | | 2.539, |
1049 | | }, |
1050 | | { |
1051 | | 6.943, |
1052 | | 4.864, |
1053 | | 4.021, |
1054 | | 3.548, |
1055 | | 3.240, |
1056 | | 3.022, |
1057 | | 2.857, |
1058 | | 2.728, |
1059 | | 2.623, |
1060 | | 2.537, |
1061 | | }, |
1062 | | { |
1063 | | 6.939, |
1064 | | 4.861, |
1065 | | 4.018, |
1066 | | 3.545, |
1067 | | 3.238, |
1068 | | 3.019, |
1069 | | 2.854, |
1070 | | 2.725, |
1071 | | 2.621, |
1072 | | 2.534, |
1073 | | }, |
1074 | | { |
1075 | | 6.935, |
1076 | | 4.858, |
1077 | | 4.015, |
1078 | | 3.543, |
1079 | | 3.235, |
1080 | | 3.017, |
1081 | | 2.852, |
1082 | | 2.723, |
1083 | | 2.618, |
1084 | | 2.532, |
1085 | | }, |
1086 | | { |
1087 | | 6.932, |
1088 | | 4.855, |
1089 | | 4.012, |
1090 | | 3.540, |
1091 | | 3.233, |
1092 | | 3.014, |
1093 | | 2.849, |
1094 | | 2.720, |
1095 | | 2.616, |
1096 | | 2.529, |
1097 | | }, |
1098 | | { |
1099 | | 6.928, |
1100 | | 4.852, |
1101 | | 4.010, |
1102 | | 3.538, |
1103 | | 3.230, |
1104 | | 3.012, |
1105 | | 2.847, |
1106 | | 2.718, |
1107 | | 2.613, |
1108 | | 2.527, |
1109 | | }, |
1110 | | { |
1111 | | 6.925, |
1112 | | 4.849, |
1113 | | 4.007, |
1114 | | 3.535, |
1115 | | 3.228, |
1116 | | 3.009, |
1117 | | 2.845, |
1118 | | 2.715, |
1119 | | 2.611, |
1120 | | 2.524, |
1121 | | }, |
1122 | | { |
1123 | | 6.922, |
1124 | | 4.846, |
1125 | | 4.004, |
1126 | | 3.533, |
1127 | | 3.225, |
1128 | | 3.007, |
1129 | | 2.842, |
1130 | | 2.713, |
1131 | | 2.609, |
1132 | | 2.522, |
1133 | | }, |
1134 | | { |
1135 | | 6.919, |
1136 | | 4.844, |
1137 | | 4.002, |
1138 | | 3.530, |
1139 | | 3.223, |
1140 | | 3.004, |
1141 | | 2.840, |
1142 | | 2.711, |
1143 | | 2.606, |
1144 | | 2.520, |
1145 | | }, |
1146 | | { |
1147 | | 6.915, |
1148 | | 4.841, |
1149 | | 3.999, |
1150 | | 3.528, |
1151 | | 3.221, |
1152 | | 3.002, |
1153 | | 2.838, |
1154 | | 2.709, |
1155 | | 2.604, |
1156 | | 2.518, |
1157 | | }, |
1158 | | { |
1159 | | 6.912, |
1160 | | 4.838, |
1161 | | 3.997, |
1162 | | 3.525, |
1163 | | 3.218, |
1164 | | 3.000, |
1165 | | 2.835, |
1166 | | 2.706, |
1167 | | 2.602, |
1168 | | 2.515, |
1169 | | }, |
1170 | | { |
1171 | | 6.909, |
1172 | | 4.836, |
1173 | | 3.995, |
1174 | | 3.523, |
1175 | | 3.216, |
1176 | | 2.998, |
1177 | | 2.833, |
1178 | | 2.704, |
1179 | | 2.600, |
1180 | | 2.513, |
1181 | | }, |
1182 | | { |
1183 | | 6.906, |
1184 | | 4.833, |
1185 | | 3.992, |
1186 | | 3.521, |
1187 | | 3.214, |
1188 | | 2.996, |
1189 | | 2.831, |
1190 | | 2.702, |
1191 | | 2.598, |
1192 | | 2.511, |
1193 | | }, |
1194 | | { |
1195 | | 6.904, |
1196 | | 4.831, |
1197 | | 3.990, |
1198 | | 3.519, |
1199 | | 3.212, |
1200 | | 2.994, |
1201 | | 2.829, |
1202 | | 2.700, |
1203 | | 2.596, |
1204 | | 2.509, |
1205 | | }, |
1206 | | { |
1207 | | 6.901, |
1208 | | 4.829, |
1209 | | 3.988, |
1210 | | 3.517, |
1211 | | 3.210, |
1212 | | 2.992, |
1213 | | 2.827, |
1214 | | 2.698, |
1215 | | 2.594, |
1216 | | 2.507, |
1217 | | }, |
1218 | | { |
1219 | | 6.898, |
1220 | | 4.826, |
1221 | | 3.986, |
1222 | | 3.515, |
1223 | | 3.208, |
1224 | | 2.990, |
1225 | | 2.825, |
1226 | | 2.696, |
1227 | | 2.592, |
1228 | | 2.505, |
1229 | | }, |
1230 | | {6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}}; |
1231 | | |
1232 | | /** define the variance which will be used as a minimum variance for any |
1233 | | dimension of any feature. Since most features are calculated from numbers |
1234 | | with a precision no better than 1 in 128, the variance should never be |
1235 | | less than the square of this number for parameters whose range is 1. */ |
1236 | 0 | #define MINVARIANCE 0.0004 |
1237 | | |
1238 | | /** define the absolute minimum number of samples which must be present in |
1239 | | order to accurately test hypotheses about underlying probability |
1240 | | distributions. Define separately the minimum samples that are needed |
1241 | | before a statistical analysis is attempted; this number should be |
1242 | | equal to MINSAMPLES but can be set to a lower number for early testing |
1243 | | when very few samples are available. */ |
1244 | | #define MINSAMPLESPERBUCKET 5 |
1245 | | #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET) |
1246 | 0 | #define MINSAMPLESNEEDED 1 |
1247 | | |
1248 | | /** define the size of the table which maps normalized samples to |
1249 | | histogram buckets. Also define the number of standard deviations |
1250 | | in a normal distribution which are considered to be significant. |
1251 | | The mapping table will be defined in such a way that it covers |
1252 | | the specified number of standard deviations on either side of |
1253 | | the mean. BUCKETTABLESIZE should always be even. */ |
1254 | 0 | #define BUCKETTABLESIZE 1024 |
1255 | | #define NORMALEXTENT 3.0 |
1256 | | |
// A candidate pairing of a cluster with its nearest neighbor, kept on the
// cluster heap while the cluster tree is built (see ClusteringContext and
// MakePotentialClusters below).
struct TEMPCLUSTER {
  CLUSTER *Cluster;  // cluster being considered for merging
  CLUSTER *Neighbor; // nearest neighboring cluster found for it so far
};
1261 | | |
1262 | | using ClusterPair = tesseract::KDPairInc<float, TEMPCLUSTER *>; |
1263 | | using ClusterHeap = tesseract::GenericHeap<ClusterPair>; |
1264 | | |
// Per-cluster statistics gathered over a `dims`-dimensional feature space.
// All buffers are sized up front by the constructor: the covariance matrix
// is dims x dims, Min/Max hold one slot per dimension.
struct STATISTICS {
  STATISTICS(size_t dims) : CoVariance(dims * dims), Min(dims), Max(dims) {}
  float AvgVariance = 1.0f;      // average variance across dimensions
  std::vector<float> CoVariance; // row-major dims x dims covariance matrix
  std::vector<float> Min;        // largest negative distance from the mean
  std::vector<float> Max;        // largest positive distance from the mean
};
1273 | | |
1274 | | struct BUCKETS { |
1275 | 0 | BUCKETS(size_t n) : NumberOfBuckets(n), Count(n), ExpectedCount(n) { |
1276 | 0 | } |
1277 | 0 | ~BUCKETS() { |
1278 | 0 | } |
1279 | | DISTRIBUTION Distribution = normal; // distribution being tested for |
1280 | | uint32_t SampleCount = 0; // # of samples in histogram |
1281 | | double Confidence = 0.0; // confidence level of test |
1282 | | double ChiSquared = 0.0; // test threshold |
1283 | | uint16_t NumberOfBuckets; // number of cells in histogram |
1284 | | uint16_t Bucket[BUCKETTABLESIZE]; // mapping to histogram buckets |
1285 | | std::vector<uint32_t> Count; // frequency of occurrence histogram |
1286 | | std::vector<float> ExpectedCount; // expected histogram |
1287 | | }; |
1288 | | |
struct CHISTRUCT {
  /// Bundles a chi-squared value with its associated number of degrees of
  /// freedom and alpha (confidence) level. The constructor only records the
  /// two inputs; ChiSquared is filled in later by the solver.
  ///
  /// @param degreesOfFreedom degrees of freedom for the new chi value
  /// @param alpha confidence level for the new chi value
  CHISTRUCT(uint16_t degreesOfFreedom, double alpha)
      : DegreesOfFreedom(degreesOfFreedom), Alpha(alpha) {}
  uint16_t DegreesOfFreedom = 0;
  double Alpha = 0.0;
  double ChiSquared = 0.0;
};
1302 | | |
1303 | | // For use with KDWalk / MakePotentialClusters |
1304 | | struct ClusteringContext { |
1305 | | ClusterHeap *heap; // heap used to hold temp clusters, "best" on top |
1306 | | TEMPCLUSTER *candidates; // array of potential clusters |
1307 | | KDTREE *tree; // kd-tree to be searched for neighbors |
1308 | | int32_t next; // next candidate to be used |
1309 | | }; |
1310 | | |
1311 | | using DENSITYFUNC = double (*)(int32_t); |
1312 | | using SOLVEFUNC = double (*)(CHISTRUCT *, double); |
1313 | | |
1314 | 0 | #define Odd(N) ((N) % 2) |
1315 | 0 | #define Mirror(N, R) ((R) - (N)-1) |
1316 | 0 | #define Abs(N) (((N) < 0) ? (-(N)) : (N)) |
1317 | | |
1318 | | //--------------Global Data Definitions and Declarations---------------------- |
1319 | | /** the following variables describe a discrete normal distribution |
1320 | | which is used by NormalDensity() and NormalBucket(). The |
1321 | | constant NORMALEXTENT determines how many standard |
1322 | | deviations of the distribution are mapped onto the fixed |
1323 | | discrete range of x. x=0 is mapped to -NORMALEXTENT standard |
1324 | | deviations and x=BUCKETTABLESIZE is mapped to |
1325 | | +NORMALEXTENT standard deviations. */ |
1326 | | #define SqrtOf2Pi 2.506628275 |
1327 | | static const double kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT); |
1328 | | static const double kNormalVariance = |
1329 | | (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT); |
1330 | | static const double kNormalMagnitude = (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE); |
1331 | | static const double kNormalMean = BUCKETTABLESIZE / 2; |
1332 | | |
1333 | | /** define lookup tables used to compute the number of histogram buckets |
1334 | | that should be used for a given number of samples. */ |
1335 | 0 | #define LOOKUPTABLESIZE 8 |
1336 | | #define MAXDEGREESOFFREEDOM MAXBUCKETS |
1337 | | |
1338 | | static const uint32_t kCountTable[LOOKUPTABLESIZE] = {MINSAMPLES, 200, 400, 600, 800, |
1339 | | 1000, 1500, 2000}; // number of samples |
1340 | | |
1341 | | static const uint16_t kBucketsTable[LOOKUPTABLESIZE] = { |
1342 | | MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS}; // number of buckets |
1343 | | |
1344 | | /*------------------------------------------------------------------------- |
1345 | | Private Function Prototypes |
1346 | | --------------------------------------------------------------------------*/ |
1347 | | static void CreateClusterTree(CLUSTERER *Clusterer); |
1348 | | |
1349 | | static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t Level); |
1350 | | |
1351 | | static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance); |
1352 | | |
1353 | | static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster); |
1354 | | |
1355 | | static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config); |
1356 | | |
1357 | | static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster); |
1358 | | |
1359 | | static PROTOTYPE *MakeDegenerateProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics, |
1360 | | PROTOSTYLE Style, int32_t MinSamples); |
1361 | | |
1362 | | static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, |
1363 | | STATISTICS *Statistics); |
1364 | | |
1365 | | static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, |
1366 | | BUCKETS *Buckets); |
1367 | | |
1368 | | static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, |
1369 | | STATISTICS *Statistics, BUCKETS *Buckets); |
1370 | | |
1371 | | static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, |
1372 | | BUCKETS *NormalBuckets, double Confidence); |
1373 | | |
1374 | | static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc); |
1375 | | |
1376 | | static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics); |
1377 | | |
1378 | | static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster); |
1379 | | |
1380 | | static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics); |
1381 | | |
1382 | | static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics); |
1383 | | |
1384 | | static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics); |
1385 | | |
1386 | | static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster); |
1387 | | |
1388 | | static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence); |
1389 | | |
1390 | | static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount, |
1391 | | double Confidence); |
1392 | | |
1393 | | static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence); |
1394 | | |
1395 | | static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount); |
1396 | | |
1397 | | static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha); |
1398 | | |
1399 | | static double NormalDensity(int32_t x); |
1400 | | |
1401 | | static double UniformDensity(int32_t x); |
1402 | | |
1403 | | static double Integral(double f1, double f2, double Dx); |
1404 | | |
1405 | | static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc, |
1406 | | float Mean, float StdDev); |
1407 | | |
1408 | | static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev); |
1409 | | |
1410 | | static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev); |
1411 | | |
1412 | | static bool DistributionOK(BUCKETS *Buckets); |
1413 | | |
1414 | | static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets); |
1415 | | |
1416 | | static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount); |
1417 | | |
1418 | | static void InitBuckets(BUCKETS *Buckets); |
1419 | | |
1420 | | static int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct, |
1421 | | void *arg2); // CHISTRUCT *SearchKey); |
1422 | | |
1423 | | static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy); |
1424 | | |
1425 | | static double ChiArea(CHISTRUCT *ChiParams, double x); |
1426 | | |
1427 | | static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal); |
1428 | | |
1429 | | static double InvertMatrix(const float *input, int size, float *inv); |
1430 | | |
1431 | | //--------------------------Public Code-------------------------------------- |
1432 | | /** |
1433 | | * This routine creates a new clusterer data structure, |
1434 | | * initializes it, and returns a pointer to it. |
1435 | | * |
1436 | | * @param SampleSize number of dimensions in feature space |
1437 | | * @param ParamDesc description of each dimension |
1438 | | * @return pointer to the new clusterer data structure |
1439 | | */ |
1440 | 0 | CLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]) { |
1441 | 0 | int i; |
1442 | | |
1443 | | // allocate main clusterer data structure and init simple fields |
1444 | 0 | auto Clusterer = new CLUSTERER; |
1445 | 0 | Clusterer->SampleSize = SampleSize; |
1446 | 0 | Clusterer->NumberOfSamples = 0; |
1447 | 0 | Clusterer->NumChar = 0; |
1448 | | |
1449 | | // init fields which will not be used initially |
1450 | 0 | Clusterer->Root = nullptr; |
1451 | 0 | Clusterer->ProtoList = NIL_LIST; |
1452 | | |
1453 | | // maintain a copy of param descriptors in the clusterer data structure |
1454 | 0 | Clusterer->ParamDesc = new PARAM_DESC[SampleSize]; |
1455 | 0 | for (i = 0; i < SampleSize; i++) { |
1456 | 0 | Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular; |
1457 | 0 | Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential; |
1458 | 0 | Clusterer->ParamDesc[i].Min = ParamDesc[i].Min; |
1459 | 0 | Clusterer->ParamDesc[i].Max = ParamDesc[i].Max; |
1460 | 0 | Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min; |
1461 | 0 | Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2; |
1462 | 0 | Clusterer->ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2; |
1463 | 0 | } |
1464 | | |
1465 | | // allocate a kd tree to hold the samples |
1466 | 0 | Clusterer->KDTree = MakeKDTree(SampleSize, ParamDesc); |
1467 | | |
1468 | | // Initialize cache of histogram buckets to minimize recomputing them. |
1469 | 0 | for (auto &d : Clusterer->bucket_cache) { |
1470 | 0 | for (auto &c : d) { |
1471 | 0 | c = nullptr; |
1472 | 0 | } |
1473 | 0 | } |
1474 | |
|
1475 | 0 | return Clusterer; |
1476 | 0 | } // MakeClusterer |
1477 | | |
1478 | | /** |
1479 | | * This routine creates a new sample data structure to hold |
1480 | | * the specified feature. This sample is added to the clusterer |
1481 | | * data structure (so that it knows which samples are to be |
1482 | | * clustered later), and a pointer to the sample is returned to |
1483 | | * the caller. |
1484 | | * |
1485 | | * @param Clusterer clusterer data structure to add sample to |
1486 | | * @param Feature feature to be added to clusterer |
1487 | | * @param CharID unique ident. of char that sample came from |
1488 | | * |
1489 | | * @return Pointer to the new sample data structure |
1490 | | */ |
1491 | 0 | SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID) { |
1492 | 0 | int i; |
1493 | | |
1494 | | // see if the samples have already been clustered - if so trap an error |
1495 | | // Can't add samples after they have been clustered. |
1496 | 0 | ASSERT_HOST(Clusterer->Root == nullptr); |
1497 | | |
1498 | | // allocate the new sample and initialize it |
1499 | 0 | auto Sample = new SAMPLE(Clusterer->SampleSize); |
1500 | 0 | Sample->Clustered = false; |
1501 | 0 | Sample->Prototype = false; |
1502 | 0 | Sample->SampleCount = 1; |
1503 | 0 | Sample->Left = nullptr; |
1504 | 0 | Sample->Right = nullptr; |
1505 | 0 | Sample->CharID = CharID; |
1506 | |
|
1507 | 0 | for (i = 0; i < Clusterer->SampleSize; i++) { |
1508 | 0 | Sample->Mean[i] = Feature[i]; |
1509 | 0 | } |
1510 | | |
1511 | | // add the sample to the KD tree - keep track of the total # of samples |
1512 | 0 | Clusterer->NumberOfSamples++; |
1513 | 0 | KDStore(Clusterer->KDTree, &Sample->Mean[0], Sample); |
1514 | 0 | if (CharID >= Clusterer->NumChar) { |
1515 | 0 | Clusterer->NumChar = CharID + 1; |
1516 | 0 | } |
1517 | | |
1518 | | // execute hook for monitoring clustering operation |
1519 | | // (*SampleCreationHook)(Sample); |
1520 | |
|
1521 | 0 | return (Sample); |
1522 | 0 | } // MakeSample |
1523 | | |
1524 | | /** |
1525 | | * This routine first checks to see if the samples in this |
1526 | | * clusterer have already been clustered before; if so, it does |
1527 | | * not bother to recreate the cluster tree. It simply recomputes |
1528 | | * the prototypes based on the new Config info. |
1529 | | * |
1530 | | * If the samples have not been clustered before, the |
1531 | | * samples in the KD tree are formed into a cluster tree and then |
1532 | | * the prototypes are computed from the cluster tree. |
1533 | | * |
1534 | | * In either case this routine returns a pointer to a |
1535 | | * list of prototypes that best represent the samples given |
1536 | | * the constraints specified in Config. |
1537 | | * |
1538 | | * @param Clusterer data struct containing samples to be clustered |
1539 | | * @param Config parameters which control clustering process |
1540 | | * |
1541 | | * @return Pointer to a list of prototypes |
1542 | | */ |
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
  // Only create the cluster tree if the samples have never been clustered
  // before; on subsequent calls only the prototypes are recomputed.
  if (Clusterer->Root == nullptr) {
    CreateClusterTree(Clusterer);
  }

  // Deallocate the old prototype list if one exists.
  FreeProtoList(&Clusterer->ProtoList);
  Clusterer->ProtoList = NIL_LIST;

  // Compute prototypes starting at the root node in the tree.
  ComputePrototypes(Clusterer, Config);
  // We don't need the cluster pointers in the protos any more, so null them
  // out, which makes it safe to delete the clusterer.
  LIST proto_list = Clusterer->ProtoList;
  iterate(proto_list) {
    auto *proto = reinterpret_cast<PROTOTYPE *>(proto_list->first_node());
    proto->Cluster = nullptr;
  }
  return Clusterer->ProtoList;
} // ClusterSamples
1564 | | |
1565 | | /** |
1566 | | * This routine frees all of the memory allocated to the |
1567 | | * specified data structure. It will not, however, free |
1568 | | * the memory used by the prototype list. The pointers to |
1569 | | * the clusters for each prototype in the list will be set |
1570 | | * to nullptr to indicate that the cluster data structures no |
1571 | | * longer exist. Any sample lists that have been obtained |
1572 | | * via calls to GetSamples are no longer valid. |
1573 | | * @param Clusterer pointer to data structure to be freed |
1574 | | */ |
1575 | 0 | void FreeClusterer(CLUSTERER *Clusterer) { |
1576 | 0 | if (Clusterer != nullptr) { |
1577 | 0 | delete[] Clusterer->ParamDesc; |
1578 | 0 | delete Clusterer->KDTree; |
1579 | 0 | delete Clusterer->Root; |
1580 | | // Free up all used buckets structures. |
1581 | 0 | for (auto &d : Clusterer->bucket_cache) { |
1582 | 0 | for (auto &c : d) { |
1583 | 0 | delete c; |
1584 | 0 | } |
1585 | 0 | } |
1586 | |
|
1587 | 0 | delete Clusterer; |
1588 | 0 | } |
1589 | 0 | } // FreeClusterer |
1590 | | |
1591 | | /** |
1592 | | * This routine frees all of the memory allocated to the |
1593 | | * specified list of prototypes. The clusters which are |
1594 | | * pointed to by the prototypes are not freed. |
1595 | | * @param ProtoList pointer to list of prototypes to be freed |
1596 | | */ |
void FreeProtoList(LIST *ProtoList) {
  // Destroy every node in the list, freeing each prototype via FreePrototype.
  destroy_nodes(*ProtoList, FreePrototype);
} // FreeProtoList
1600 | | |
1601 | | /** |
1602 | | * This routine deallocates the memory consumed by the specified |
1603 | | * prototype and modifies the corresponding cluster so that it |
1604 | | * is no longer marked as a prototype. The cluster is NOT |
1605 | | * deallocated by this routine. |
1606 | | * @param arg prototype data structure to be deallocated |
1607 | | */ |
1608 | 0 | void FreePrototype(void *arg) { // PROTOTYPE *Prototype) |
1609 | 0 | auto *Prototype = static_cast<PROTOTYPE *>(arg); |
1610 | | |
1611 | | // unmark the corresponding cluster (if there is one |
1612 | 0 | if (Prototype->Cluster != nullptr) { |
1613 | 0 | Prototype->Cluster->Prototype = false; |
1614 | 0 | } |
1615 | | |
1616 | | // deallocate the prototype statistics and then the prototype itself |
1617 | 0 | if (Prototype->Style != spherical) { |
1618 | 0 | delete[] Prototype->Variance.Elliptical; |
1619 | 0 | delete[] Prototype->Magnitude.Elliptical; |
1620 | 0 | delete[] Prototype->Weight.Elliptical; |
1621 | 0 | } |
1622 | 0 | delete Prototype; |
1623 | 0 | } // FreePrototype |
1624 | | |
1625 | | /** |
1626 | | * This routine is used to find all of the samples which |
1627 | | * belong to a cluster. It starts by removing the top |
1628 | | * cluster on the cluster list (SearchState). If this cluster is |
1629 | | * a leaf it is returned. Otherwise, the right subcluster |
1630 | | * is pushed on the list and we continue the search in the |
1631 | | * left subcluster. This continues until a leaf is found. |
1632 | | * If all samples have been found, nullptr is returned. |
1633 | | * InitSampleSearch() must be called |
1634 | | * before NextSample() to initialize the search. |
1635 | | * @param SearchState ptr to list containing clusters to be searched |
1636 | | * @return Pointer to the next leaf cluster (sample) or nullptr. |
1637 | | */ |
1638 | 0 | CLUSTER *NextSample(LIST *SearchState) { |
1639 | 0 | CLUSTER *Cluster; |
1640 | |
|
1641 | 0 | if (*SearchState == NIL_LIST) { |
1642 | 0 | return (nullptr); |
1643 | 0 | } |
1644 | 0 | Cluster = reinterpret_cast<CLUSTER *>((*SearchState)->first_node()); |
1645 | 0 | *SearchState = pop(*SearchState); |
1646 | 0 | for (;;) { |
1647 | 0 | if (Cluster->Left == nullptr) { |
1648 | 0 | return (Cluster); |
1649 | 0 | } |
1650 | 0 | *SearchState = push(*SearchState, Cluster->Right); |
1651 | 0 | Cluster = Cluster->Left; |
1652 | 0 | } |
1653 | 0 | } // NextSample |
1654 | | |
1655 | | /** |
1656 | | * This routine returns the mean of the specified |
1657 | | * prototype in the indicated dimension. |
1658 | | * @param Proto prototype to return mean of |
1659 | | * @param Dimension dimension whose mean is to be returned |
1660 | | * @return Mean of Prototype in Dimension |
1661 | | */ |
1662 | 0 | float Mean(PROTOTYPE *Proto, uint16_t Dimension) { |
1663 | 0 | return (Proto->Mean[Dimension]); |
1664 | 0 | } // Mean |
1665 | | |
1666 | | /** |
1667 | | * This routine returns the standard deviation of the |
1668 | | * prototype in the indicated dimension. |
1669 | | * @param Proto prototype to return standard deviation of |
1670 | | * @param Dimension dimension whose stddev is to be returned |
1671 | | * @return Standard deviation of Prototype in Dimension |
1672 | | */ |
1673 | 0 | float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension) { |
1674 | 0 | switch (Proto->Style) { |
1675 | 0 | case spherical: |
1676 | 0 | return std::sqrt(Proto->Variance.Spherical); |
1677 | 0 | case elliptical: |
1678 | 0 | return std::sqrt(Proto->Variance.Elliptical[Dimension]); |
1679 | 0 | case mixed: |
1680 | 0 | switch (Proto->Distrib[Dimension]) { |
1681 | 0 | case normal: |
1682 | 0 | return std::sqrt(Proto->Variance.Elliptical[Dimension]); |
1683 | 0 | case uniform: |
1684 | 0 | case D_random: |
1685 | 0 | return Proto->Variance.Elliptical[Dimension]; |
1686 | 0 | case DISTRIBUTION_COUNT: |
1687 | 0 | ASSERT_HOST(!"Distribution count not allowed!"); |
1688 | 0 | } |
1689 | 0 | } |
1690 | 0 | return 0.0f; |
1691 | 0 | } // StandardDeviation |
1692 | | |
1693 | | /*--------------------------------------------------------------------------- |
1694 | | Private Code |
1695 | | ----------------------------------------------------------------------------*/ |
1696 | | /** |
1697 | | * This routine performs a bottoms-up clustering on the samples |
1698 | | * held in the kd-tree of the Clusterer data structure. The |
1699 | | * result is a cluster tree. Each node in the tree represents |
1700 | | * a cluster which conceptually contains a subset of the samples. |
1701 | | * More precisely, the cluster contains all of the samples which |
1702 | | * are contained in its two sub-clusters. The leaves of the |
1703 | | * tree are the individual samples themselves; they have no |
1704 | | * sub-clusters. The root node of the tree conceptually contains |
1705 | | * all of the samples. |
1706 | | * The Clusterer data structure is changed. |
1707 | | * @param Clusterer data structure holdings samples to be clustered |
1708 | | */ |
static void CreateClusterTree(CLUSTERER *Clusterer) {
  ClusteringContext context;
  ClusterPair HeapEntry;

  // Each sample and its nearest neighbor form a "potential" cluster.
  // Save these in a heap with the "best" (closest) pairs on top.
  context.tree = Clusterer->KDTree;
  context.candidates = new TEMPCLUSTER[Clusterer->NumberOfSamples];
  context.next = 0;
  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
  KDWalk(context.tree, MakePotentialClusters, &context);

  // Form potential clusters into actual clusters - always do "best" first.
  while (context.heap->Pop(&HeapEntry)) {
    TEMPCLUSTER *PotentialCluster = HeapEntry.data();

    // If the main cluster of this potential cluster has already been merged
    // into another cluster, the pair is stale and can simply be discarded.
    if (PotentialCluster->Cluster->Clustered) {
      continue;
    }

    // If the main cluster is not yet clustered but its nearest neighbor is,
    // find a new nearest neighbor and re-queue the pair under its new
    // distance (HeapEntry.key() is updated by FindNearestNeighbor).
    else if (PotentialCluster->Neighbor->Clustered) {
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }

    // If neither cluster is already clustered, form a permanent cluster and
    // re-queue it paired with its own nearest neighbor.
    else {
      PotentialCluster->Cluster = MakeNewCluster(Clusterer, PotentialCluster);
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }
  }

  // The root node in the cluster tree is now the only node in the kd-tree.
  // (RootOf is a macro; the cast applies to its expansion.)
  Clusterer->Root = static_cast<CLUSTER *> RootOf(Clusterer->KDTree);

  // Free up the memory used by the K-D tree, heap, and temp clusters.
  delete context.tree;
  Clusterer->KDTree = nullptr;
  delete context.heap;
  delete[] context.candidates;
} // CreateClusterTree
1761 | | |
1762 | | /** |
1763 | | * This routine is designed to be used in concert with the |
1764 | | * KDWalk routine. It will create a potential cluster for |
1765 | | * each sample in the kd-tree that is being walked. This |
1766 | | * potential cluster will then be pushed on the heap. |
1767 | | * @param context ClusteringContext (see definition above) |
1768 | | * @param Cluster current cluster being visited in kd-tree walk |
1769 | | * @param Level level of this cluster in the kd-tree |
1770 | | */ |
1771 | 0 | static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t /*Level*/) { |
1772 | 0 | ClusterPair HeapEntry; |
1773 | 0 | int next = context->next; |
1774 | 0 | context->candidates[next].Cluster = Cluster; |
1775 | 0 | HeapEntry.data() = &(context->candidates[next]); |
1776 | 0 | context->candidates[next].Neighbor = |
1777 | 0 | FindNearestNeighbor(context->tree, context->candidates[next].Cluster, &HeapEntry.key()); |
1778 | 0 | if (context->candidates[next].Neighbor != nullptr) { |
1779 | 0 | context->heap->Push(&HeapEntry); |
1780 | 0 | context->next++; |
1781 | 0 | } |
1782 | 0 | } // MakePotentialClusters |
1783 | | |
1784 | | /** |
1785 | | * This routine searches the specified kd-tree for the nearest |
1786 | | * neighbor of the specified cluster. It actually uses the |
1787 | | * kd routines to find the 2 nearest neighbors since one of them |
1788 | | * will be the original cluster. A pointer to the nearest |
1789 | | * neighbor is returned, if it can be found, otherwise nullptr is |
1790 | | * returned. The distance between the 2 nodes is placed |
1791 | | * in the specified variable. |
1792 | | * @param Tree kd-tree to search in for nearest neighbor |
1793 | | * @param Cluster cluster whose nearest neighbor is to be found |
1794 | | * @param Distance ptr to variable to report distance found |
1795 | | * @return Pointer to the nearest neighbor of Cluster, or nullptr |
1796 | | */ |
static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance)
// Two neighbors are requested because the search is seeded with the
// cluster's own mean, so the cluster itself is always one of the results.
#define MAXNEIGHBORS 2
#define MAXDISTANCE FLT_MAX
{
  CLUSTER *Neighbor[MAXNEIGHBORS];
  float Dist[MAXNEIGHBORS];
  int NumberOfNeighbors;
  int32_t i;
  CLUSTER *BestNeighbor;

  // find the 2 nearest neighbors of the cluster
  KDNearestNeighborSearch(Tree, &Cluster->Mean[0], MAXNEIGHBORS, MAXDISTANCE, &NumberOfNeighbors,
                          reinterpret_cast<void **>(Neighbor), Dist);

  // Search for the nearest neighbor that is not the cluster itself;
  // *Distance is left at MAXDISTANCE when none is found.
  *Distance = MAXDISTANCE;
  BestNeighbor = nullptr;
  for (i = 0; i < NumberOfNeighbors; i++) {
    if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
      *Distance = Dist[i];
      BestNeighbor = Neighbor[i];
    }
  }
  return BestNeighbor;
} // FindNearestNeighbor
1822 | | |
1823 | | /** |
1824 | | * This routine creates a new permanent cluster from the |
1825 | | * clusters specified in TempCluster. The 2 clusters in |
1826 | | * TempCluster are marked as "clustered" and deleted from |
1827 | | * the kd-tree. The new cluster is then added to the kd-tree. |
1828 | | * @param Clusterer current clustering environment |
1829 | | * @param TempCluster potential cluster to make permanent |
1830 | | * @return Pointer to the new permanent cluster |
1831 | | */ |
1832 | 0 | static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) { |
1833 | | // allocate the new cluster and initialize it |
1834 | 0 | auto Cluster = new CLUSTER(Clusterer->SampleSize); |
1835 | 0 | Cluster->Clustered = false; |
1836 | 0 | Cluster->Prototype = false; |
1837 | 0 | Cluster->Left = TempCluster->Cluster; |
1838 | 0 | Cluster->Right = TempCluster->Neighbor; |
1839 | 0 | Cluster->CharID = -1; |
1840 | | |
1841 | | // mark the old clusters as "clustered" and delete them from the kd-tree |
1842 | 0 | Cluster->Left->Clustered = true; |
1843 | 0 | Cluster->Right->Clustered = true; |
1844 | 0 | KDDelete(Clusterer->KDTree, &Cluster->Left->Mean[0], Cluster->Left); |
1845 | 0 | KDDelete(Clusterer->KDTree, &Cluster->Right->Mean[0], Cluster->Right); |
1846 | | |
1847 | | // compute the mean and sample count for the new cluster |
1848 | 0 | Cluster->SampleCount = MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc, |
1849 | 0 | Cluster->Left->SampleCount, Cluster->Right->SampleCount, |
1850 | 0 | &Cluster->Mean[0], &Cluster->Left->Mean[0], &Cluster->Right->Mean[0]); |
1851 | | |
1852 | | // add the new cluster to the KD tree |
1853 | 0 | KDStore(Clusterer->KDTree, &Cluster->Mean[0], Cluster); |
1854 | 0 | return Cluster; |
1855 | 0 | } // MakeNewCluster |
1856 | | |
1857 | | /** |
1858 | | * This routine merges two clusters into one larger cluster. |
1859 | | * To do this it computes the number of samples in the new |
1860 | | * cluster and the mean of the new cluster. The ParamDesc |
1861 | | * information is used to ensure that circular dimensions |
1862 | | * are handled correctly. |
1863 | | * @param N # of dimensions (size of arrays) |
1864 | | * @param ParamDesc array of dimension descriptions |
1865 | | * @param n1, n2 number of samples in each old cluster |
1866 | | * @param m array to hold mean of new cluster |
1867 | | * @param m1, m2 arrays containing means of old clusters |
1868 | | * @return The number of samples in the new cluster. |
1869 | | */ |
1870 | | int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], |
1871 | 0 | float m1[], float m2[]) { |
1872 | 0 | int32_t i, n; |
1873 | |
|
1874 | 0 | n = n1 + n2; |
1875 | 0 | for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) { |
1876 | 0 | if (ParamDesc->Circular) { |
1877 | | // if distance between means is greater than allowed |
1878 | | // reduce upper point by one "rotation" to compute mean |
1879 | | // then normalize the mean back into the accepted range |
1880 | 0 | if ((*m2 - *m1) > ParamDesc->HalfRange) { |
1881 | 0 | *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n; |
1882 | 0 | if (*m < ParamDesc->Min) { |
1883 | 0 | *m += ParamDesc->Range; |
1884 | 0 | } |
1885 | 0 | } else if ((*m1 - *m2) > ParamDesc->HalfRange) { |
1886 | 0 | *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n; |
1887 | 0 | if (*m < ParamDesc->Min) { |
1888 | 0 | *m += ParamDesc->Range; |
1889 | 0 | } |
1890 | 0 | } else { |
1891 | 0 | *m = (n1 * *m1 + n2 * *m2) / n; |
1892 | 0 | } |
1893 | 0 | } else { |
1894 | 0 | *m = (n1 * *m1 + n2 * *m2) / n; |
1895 | 0 | } |
1896 | 0 | } |
1897 | 0 | return n; |
1898 | 0 | } // MergeClusters |
1899 | | |
1900 | | /** |
1901 | | * This routine decides which clusters in the cluster tree |
1902 | | * should be represented by prototypes, forms a list of these |
1903 | | * prototypes, and places the list in the Clusterer data |
1904 | | * structure. |
1905 | | * @param Clusterer data structure holding cluster tree |
1906 | | * @param Config parameters used to control prototype generation |
1907 | | */ |
static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
  LIST ClusterStack = NIL_LIST;
  CLUSTER *Cluster;
  PROTOTYPE *Prototype;

  // Use a stack to keep track of clusters waiting to be processed;
  // initially the only cluster on the stack is the root cluster.
  if (Clusterer->Root != nullptr) {
    ClusterStack = push(NIL_LIST, Clusterer->Root);
  }

  // Loop until we have analyzed all clusters which are potential prototypes.
  while (ClusterStack != NIL_LIST) {
    // Remove the next cluster to be analyzed from the stack and try to make
    // a prototype from it.  If successful, put the prototype on the proto
    // list; otherwise push the cluster's children (Right first, then Left,
    // so Left is processed first: a depth-first, left-to-right traversal).
    Cluster = reinterpret_cast<CLUSTER *>(ClusterStack->first_node());
    ClusterStack = pop(ClusterStack);
    Prototype = MakePrototype(Clusterer, Config, Cluster);
    if (Prototype != nullptr) {
      Clusterer->ProtoList = push(Clusterer->ProtoList, Prototype);
    } else {
      ClusterStack = push(ClusterStack, Cluster->Right);
      ClusterStack = push(ClusterStack, Cluster->Left);
    }
  }
} // ComputePrototypes
1935 | | |
1936 | | /** |
1937 | | * This routine attempts to create a prototype from the |
1938 | | * specified cluster that conforms to the distribution |
1939 | | * specified in Config. If there are too few samples in the |
1940 | | * cluster to perform a statistical analysis, then a prototype |
1941 | | * is generated but labelled as insignificant. If the |
1942 | | * dimensions of the cluster are not independent, no prototype |
1943 | | * is generated and nullptr is returned. If a prototype can be |
1944 | | * found that matches the desired distribution then a pointer |
1945 | | * to it is returned, otherwise nullptr is returned. |
1946 | | * @param Clusterer data structure holding cluster tree |
1947 | | * @param Config parameters used to control prototype generation |
1948 | | * @param Cluster cluster to be made into a prototype |
1949 | | * @return Pointer to new prototype or nullptr |
1950 | | */ |
static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster) {
  PROTOTYPE *Proto;
  BUCKETS *Buckets;

  // Filter out clusters which contain too many samples from the same
  // character; such clusters are split rather than made into prototypes.
  if (MultipleCharSamples(Clusterer, Cluster, Config->MaxIllegal)) {
    return nullptr;
  }

  // Compute the covariance matrix and ranges for the cluster.
  // Statistics is owned by this function and freed on every exit path below.
  auto Statistics = ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);

  // Check for degenerate (too-small) clusters which need not be analyzed
  // further; they yield an insignificant prototype immediately.  Note that
  // the MinSamples test assumes that all clusters with multiple character
  // samples have been removed (as above).
  Proto = MakeDegenerateProto(Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
                              static_cast<int32_t>(Config->MinSamples * Clusterer->NumChar));
  if (Proto != nullptr) {
    delete Statistics;
    return Proto;
  }
  // Check to ensure that all dimensions are independent; if not, no
  // prototype can be made from this cluster and it will be split.
  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize, &Statistics->CoVariance[0],
                   Config->Independence)) {
    delete Statistics;
    return nullptr;
  }

  // Optionally use Hotelling's T-squared test to decide whether the cluster
  // should stay together as an elliptical prototype (HOTELLING is a
  // compile-time switch defined at the top of this file).
  if (HOTELLING && Config->ProtoStyle == elliptical) {
    Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
    if (Proto != nullptr) {
      delete Statistics;
      return Proto;
    }
  }

  // Create (or fetch from cache) a histogram used to evaluate distributions.
  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount, Config->Confidence);

  // Create a prototype based on the statistics and test it.  For the
  // "automatic" style, try spherical first, then elliptical, then mixed.
  switch (Config->ProtoStyle) {
    case spherical:
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case elliptical:
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case mixed:
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
      break;
    case automatic:
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr) {
        break;
      }
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr) {
        break;
      }
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
      break;
  }
  delete Statistics;
  return Proto;
} // MakePrototype
2016 | | |
2017 | | /** |
2018 | | * This routine checks for clusters which are degenerate and |
2019 | | * therefore cannot be analyzed in a statistically valid way. |
2020 | | * A cluster is defined as degenerate if it does not have at |
2021 | | * least MINSAMPLESNEEDED samples in it. If the cluster is |
2022 | | * found to be degenerate, a prototype of the specified style |
2023 | | * is generated and marked as insignificant. A cluster is |
2024 | | * also degenerate if it does not have at least MinSamples |
2025 | | * samples in it. |
2026 | | * |
2027 | | * If the cluster is not degenerate, nullptr is returned. |
2028 | | * |
2029 | | * @param N number of dimensions |
2030 | | * @param Cluster cluster being analyzed |
2031 | | * @param Statistics statistical info about cluster |
2032 | | * @param Style type of prototype to be generated |
2033 | | * @param MinSamples minimum number of samples in a cluster |
2034 | | * @return Pointer to degenerate prototype or nullptr. |
2035 | | */ |
2036 | | static PROTOTYPE *MakeDegenerateProto( // this was MinSample |
2037 | 0 | uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, int32_t MinSamples) { |
2038 | 0 | PROTOTYPE *Proto = nullptr; |
2039 | |
|
2040 | 0 | if (MinSamples < MINSAMPLESNEEDED) { |
2041 | 0 | MinSamples = MINSAMPLESNEEDED; |
2042 | 0 | } |
2043 | |
|
2044 | 0 | if (Cluster->SampleCount < MinSamples) { |
2045 | 0 | switch (Style) { |
2046 | 0 | case spherical: |
2047 | 0 | Proto = NewSphericalProto(N, Cluster, Statistics); |
2048 | 0 | break; |
2049 | 0 | case elliptical: |
2050 | 0 | case automatic: |
2051 | 0 | Proto = NewEllipticalProto(N, Cluster, Statistics); |
2052 | 0 | break; |
2053 | 0 | case mixed: |
2054 | 0 | Proto = NewMixedProto(N, Cluster, Statistics); |
2055 | 0 | break; |
2056 | 0 | } |
2057 | 0 | Proto->Significant = false; |
2058 | 0 | } |
2059 | 0 | return (Proto); |
2060 | 0 | } // MakeDegenerateProto |
2061 | | |
2062 | | /** |
2063 | | * This routine tests the specified cluster to see if ** |
2064 | | * there is a statistically significant difference between |
2065 | | * the sub-clusters that would be made if the cluster were to |
2066 | | * be split. If not, then a new prototype is formed and |
2067 | | * returned to the caller. If there is, then nullptr is returned |
2068 | | * to the caller. |
2069 | | * @param Clusterer data struct containing samples being clustered |
2070 | | * @param Config provides the magic number of samples that make a good cluster |
2071 | | * @param Cluster cluster to be made into an elliptical prototype |
2072 | | * @param Statistics statistical info about cluster |
2073 | | * @return Pointer to new elliptical prototype or nullptr. |
2074 | | */ |
static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,
                                      STATISTICS *Statistics) {
  // Fraction of the number of samples used as a range around 1 within
  // which a cluster has the magic size that allows a boost to the
  // FTable by kFTableBoostMargin, thus allowing clusters near the
  // magic size (equal to the number of sample characters) to be more
  // likely to stay together.
  const double kMagicSampleMargin = 0.0625;
  const double kFTableBoostMargin = 2.0;

  int N = Clusterer->SampleSize;
  CLUSTER *Left = Cluster->Left;
  CLUSTER *Right = Cluster->Right;
  // A leaf has no sub-clusters to compare, so it cannot be tested here.
  if (Left == nullptr || Right == nullptr) {
    return nullptr;
  }
  int TotalDims = Left->SampleCount + Right->SampleCount;
  // Too few samples for the test to be meaningful.
  if (TotalDims < N + 1 || TotalDims < 2) {
    return nullptr;
  }
  std::vector<float> Covariance(static_cast<size_t>(N) * N);
  std::vector<float> Inverse(static_cast<size_t>(N) * N);
  std::vector<float> Delta(N);
  // Compute a new covariance matrix that only uses essential features:
  // non-essential rows/columns are replaced by identity entries so the
  // matrix stays invertible without letting those dimensions contribute.
  for (int i = 0; i < N; ++i) {
    int row_offset = i * N;
    if (!Clusterer->ParamDesc[i].NonEssential) {
      for (int j = 0; j < N; ++j) {
        if (!Clusterer->ParamDesc[j].NonEssential) {
          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    } else {
      for (int j = 0; j < N; ++j) {
        if (i == j) {
          Covariance[j + row_offset] = 1.0f;
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    }
  }
  double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);
  if (err > 1) {
    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
  }
  // Delta is the difference of the sub-cluster means over essential
  // dimensions; EssentialN counts those dimensions.
  int EssentialN = 0;
  for (int dim = 0; dim < N; ++dim) {
    if (!Clusterer->ParamDesc[dim].NonEssential) {
      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
      ++EssentialN;
    } else {
      Delta[dim] = 0.0f;
    }
  }
  // Compute Hotelling's T-squared: Delta' * Inverse(Covariance) * Delta.
  double Tsq = 0.0;
  for (int x = 0; x < N; ++x) {
    double temp = 0.0;
    for (int y = 0; y < N; ++y) {
      temp += static_cast<double>(Inverse[y + N * x]) * Delta[y];
    }
    Tsq += Delta[x] * temp;
  }
  // Changed this function to match the formula in
  // Statistical Methods in Medical Research p 473
  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
  // Convert T-squared to an F statistic and look up the critical value,
  // clamping the degrees of freedom to the bounds of FTable.
  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2) * EssentialN);
  int Fx = EssentialN;
  if (Fx > FTABLE_X) {
    Fx = FTABLE_X;
  }
  --Fx;
  int Fy = TotalDims - EssentialN - 1;
  if (Fy > FTABLE_Y) {
    Fy = FTABLE_Y;
  }
  --Fy;
  double FTarget = FTable[Fy][Fx];
  if (Config->MagicSamples > 0 && TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
    // Give magic-sized clusters a magic FTable boost.
    FTarget += kFTableBoostMargin;
  }
  // F below the critical value means the sub-clusters are not significantly
  // different, so keep them together as one elliptical prototype.
  if (F < FTarget) {
    return NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
  }
  return nullptr;
}
2167 | | |
2168 | | /** |
2169 | | * This routine tests the specified cluster to see if it can |
2170 | | * be approximated by a spherical normal distribution. If it |
2171 | | * can be, then a new prototype is formed and returned to the |
2172 | | * caller. If it can't be, then nullptr is returned to the caller. |
2173 | | * @param Clusterer data struct containing samples being clustered |
2174 | | * @param Cluster cluster to be made into a spherical prototype |
2175 | | * @param Statistics statistical info about cluster |
2176 | | * @param Buckets histogram struct used to analyze distribution |
2177 | | * @return Pointer to new spherical prototype or nullptr. |
2178 | | */ |
2179 | | static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, |
2180 | 0 | BUCKETS *Buckets) { |
2181 | 0 | PROTOTYPE *Proto = nullptr; |
2182 | 0 | int i; |
2183 | | |
2184 | | // check that each dimension is a normal distribution |
2185 | 0 | for (i = 0; i < Clusterer->SampleSize; i++) { |
2186 | 0 | if (Clusterer->ParamDesc[i].NonEssential) { |
2187 | 0 | continue; |
2188 | 0 | } |
2189 | | |
2190 | 0 | FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i], |
2191 | 0 | sqrt(static_cast<double>(Statistics->AvgVariance))); |
2192 | 0 | if (!DistributionOK(Buckets)) { |
2193 | 0 | break; |
2194 | 0 | } |
2195 | 0 | } |
2196 | | // if all dimensions matched a normal distribution, make a proto |
2197 | 0 | if (i >= Clusterer->SampleSize) { |
2198 | 0 | Proto = NewSphericalProto(Clusterer->SampleSize, Cluster, Statistics); |
2199 | 0 | } |
2200 | 0 | return (Proto); |
2201 | 0 | } // MakeSphericalProto |
2202 | | |
2203 | | /** |
2204 | | * This routine tests the specified cluster to see if it can |
2205 | | * be approximated by an elliptical normal distribution. If it |
2206 | | * can be, then a new prototype is formed and returned to the |
2207 | | * caller. If it can't be, then nullptr is returned to the caller. |
2208 | | * @param Clusterer data struct containing samples being clustered |
2209 | | * @param Cluster cluster to be made into an elliptical prototype |
2210 | | * @param Statistics statistical info about cluster |
2211 | | * @param Buckets histogram struct used to analyze distribution |
2212 | | * @return Pointer to new elliptical prototype or nullptr. |
2213 | | */ |
2214 | | static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, |
2215 | 0 | STATISTICS *Statistics, BUCKETS *Buckets) { |
2216 | 0 | PROTOTYPE *Proto = nullptr; |
2217 | 0 | int i; |
2218 | | |
2219 | | // check that each dimension is a normal distribution |
2220 | 0 | for (i = 0; i < Clusterer->SampleSize; i++) { |
2221 | 0 | if (Clusterer->ParamDesc[i].NonEssential) { |
2222 | 0 | continue; |
2223 | 0 | } |
2224 | | |
2225 | 0 | FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i], |
2226 | 0 | sqrt(static_cast<double>(Statistics->CoVariance[i * (Clusterer->SampleSize + 1)]))); |
2227 | 0 | if (!DistributionOK(Buckets)) { |
2228 | 0 | break; |
2229 | 0 | } |
2230 | 0 | } |
2231 | | // if all dimensions matched a normal distribution, make a proto |
2232 | 0 | if (i >= Clusterer->SampleSize) { |
2233 | 0 | Proto = NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics); |
2234 | 0 | } |
2235 | 0 | return (Proto); |
2236 | 0 | } // MakeEllipticalProto |
2237 | | |
2238 | | /** |
2239 | | * This routine tests each dimension of the specified cluster to |
2240 | | * see what distribution would best approximate that dimension. |
2241 | | * Each dimension is compared to the following distributions |
2242 | | * in order: normal, random, uniform. If each dimension can |
2243 | | * be represented by one of these distributions, |
2244 | | * then a new prototype is formed and returned to the |
2245 | | * caller. If it can't be, then nullptr is returned to the caller. |
2246 | | * @param Clusterer data struct containing samples being clustered |
2247 | | * @param Cluster cluster to be made into a prototype |
2248 | | * @param Statistics statistical info about cluster |
2249 | | * @param NormalBuckets histogram struct used to analyze distribution |
2250 | | * @param Confidence confidence level for alternate distributions |
2251 | | * @return Pointer to new mixed prototype or nullptr. |
2252 | | */ |
2253 | | static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, |
2254 | 0 | BUCKETS *NormalBuckets, double Confidence) { |
2255 | 0 | PROTOTYPE *Proto; |
2256 | 0 | int i; |
2257 | 0 | BUCKETS *UniformBuckets = nullptr; |
2258 | 0 | BUCKETS *RandomBuckets = nullptr; |
2259 | | |
2260 | | // create a mixed proto to work on - initially assume all dimensions normal |
2261 | 0 | Proto = NewMixedProto(Clusterer->SampleSize, Cluster, Statistics); |
2262 | | |
2263 | | // find the proper distribution for each dimension |
2264 | 0 | for (i = 0; i < Clusterer->SampleSize; i++) { |
2265 | 0 | if (Clusterer->ParamDesc[i].NonEssential) { |
2266 | 0 | continue; |
2267 | 0 | } |
2268 | | |
2269 | 0 | FillBuckets(NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i], |
2270 | 0 | std::sqrt(Proto->Variance.Elliptical[i])); |
2271 | 0 | if (DistributionOK(NormalBuckets)) { |
2272 | 0 | continue; |
2273 | 0 | } |
2274 | | |
2275 | 0 | if (RandomBuckets == nullptr) { |
2276 | 0 | RandomBuckets = GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence); |
2277 | 0 | } |
2278 | 0 | MakeDimRandom(i, Proto, &(Clusterer->ParamDesc[i])); |
2279 | 0 | FillBuckets(RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i], |
2280 | 0 | Proto->Variance.Elliptical[i]); |
2281 | 0 | if (DistributionOK(RandomBuckets)) { |
2282 | 0 | continue; |
2283 | 0 | } |
2284 | | |
2285 | 0 | if (UniformBuckets == nullptr) { |
2286 | 0 | UniformBuckets = GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence); |
2287 | 0 | } |
2288 | 0 | MakeDimUniform(i, Proto, Statistics); |
2289 | 0 | FillBuckets(UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i], |
2290 | 0 | Proto->Variance.Elliptical[i]); |
2291 | 0 | if (DistributionOK(UniformBuckets)) { |
2292 | 0 | continue; |
2293 | 0 | } |
2294 | 0 | break; |
2295 | 0 | } |
2296 | | // if any dimension failed to match a distribution, discard the proto |
2297 | 0 | if (i < Clusterer->SampleSize) { |
2298 | 0 | FreePrototype(Proto); |
2299 | 0 | Proto = nullptr; |
2300 | 0 | } |
2301 | 0 | return (Proto); |
2302 | 0 | } // MakeMixedProto |
2303 | | |
2304 | | /** |
2305 | | * This routine alters the ith dimension of the specified |
2306 | | * mixed prototype to be D_random. |
2307 | | * @param i index of dimension to be changed |
2308 | | * @param Proto prototype whose dimension is to be altered |
2309 | | * @param ParamDesc description of specified dimension |
2310 | | */ |
2311 | 0 | static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) { |
2312 | 0 | Proto->Distrib[i] = D_random; |
2313 | 0 | Proto->Mean[i] = ParamDesc->MidRange; |
2314 | 0 | Proto->Variance.Elliptical[i] = ParamDesc->HalfRange; |
2315 | | |
2316 | | // subtract out the previous magnitude of this dimension from the total |
2317 | 0 | Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i]; |
2318 | 0 | Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range; |
2319 | 0 | Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; |
2320 | 0 | Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude)); |
2321 | | |
2322 | | // note that the proto Weight is irrelevant for D_random protos |
2323 | 0 | } // MakeDimRandom |
2324 | | |
2325 | | /** |
2326 | | * This routine alters the ith dimension of the specified |
2327 | | * mixed prototype to be uniform. |
2328 | | * @param i index of dimension to be changed |
2329 | | * @param Proto prototype whose dimension is to be altered |
2330 | | * @param Statistics statistical info about prototype |
2331 | | */ |
2332 | 0 | static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics) { |
2333 | 0 | Proto->Distrib[i] = uniform; |
2334 | 0 | Proto->Mean[i] = Proto->Cluster->Mean[i] + (Statistics->Min[i] + Statistics->Max[i]) / 2; |
2335 | 0 | Proto->Variance.Elliptical[i] = (Statistics->Max[i] - Statistics->Min[i]) / 2; |
2336 | 0 | if (Proto->Variance.Elliptical[i] < MINVARIANCE) { |
2337 | 0 | Proto->Variance.Elliptical[i] = MINVARIANCE; |
2338 | 0 | } |
2339 | | |
2340 | | // subtract out the previous magnitude of this dimension from the total |
2341 | 0 | Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i]; |
2342 | 0 | Proto->Magnitude.Elliptical[i] = 1.0 / (2.0 * Proto->Variance.Elliptical[i]); |
2343 | 0 | Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; |
2344 | 0 | Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude)); |
2345 | | |
2346 | | // note that the proto Weight is irrelevant for uniform protos |
2347 | 0 | } // MakeDimUniform |
2348 | | |
/**
 * This routine searches the cluster tree for all leaf nodes
 * which are samples in the specified cluster. It computes
 * a full covariance matrix for these samples as well as
 * keeping track of the ranges (min and max) for each
 * dimension. A special data structure is allocated to
 * return this information to the caller. An incremental
 * algorithm for computing statistics is not used because
 * it will not work with circular dimensions.
 * @param N number of dimensions
 * @param ParamDesc array of dimension descriptions
 * @param Cluster cluster whose stats are to be computed
 * @return Pointer to new data structure containing statistics
 */
static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster) {
  int i, j;
  LIST SearchState;
  SAMPLE *Sample;
  uint32_t SampleCountAdjustedForBias;

  // allocate memory to hold the statistics results
  auto Statistics = new STATISTICS(N);

  // allocate temporary memory to hold the sample to mean distances
  std::vector<float> Distance(N);

  // find each sample in the cluster and merge it into the statistics
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample(&SearchState)) != nullptr) {
    for (i = 0; i < N; i++) {
      // distance of this sample from the cluster mean, per dimension
      Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
      if (ParamDesc[i].Circular) {
        // for circular dimensions wrap the distance into
        // [-HalfRange, HalfRange] so it is the shortest way around
        if (Distance[i] > ParamDesc[i].HalfRange) {
          Distance[i] -= ParamDesc[i].Range;
        }
        if (Distance[i] < -ParamDesc[i].HalfRange) {
          Distance[i] += ParamDesc[i].Range;
        }
      }
      // track the extreme distances seen in each dimension
      if (Distance[i] < Statistics->Min[i]) {
        Statistics->Min[i] = Distance[i];
      }
      if (Distance[i] > Statistics->Max[i]) {
        Statistics->Max[i] = Distance[i];
      }
    }
    // accumulate the outer product Distance * Distance^T into the
    // row-major covariance matrix
    auto CoVariance = &Statistics->CoVariance[0];
    for (i = 0; i < N; i++) {
      for (j = 0; j < N; j++, CoVariance++) {
        *CoVariance += Distance[i] * Distance[j];
      }
    }
  }
  // normalize the variances by the total number of samples
  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
  // also compute the geometric mean of the diagonal variances
  // ensure that clusters with only 1 sample are handled correctly
  if (Cluster->SampleCount > 1) {
    SampleCountAdjustedForBias = Cluster->SampleCount - 1;
  } else {
    SampleCountAdjustedForBias = 1;
  }
  auto CoVariance = &Statistics->CoVariance[0];
  for (i = 0; i < N; i++) {
    for (j = 0; j < N; j++, CoVariance++) {
      *CoVariance /= SampleCountAdjustedForBias;
      if (j == i) {
        // clip diagonal variances so later sqrt/divisions stay stable
        if (*CoVariance < MINVARIANCE) {
          *CoVariance = MINVARIANCE;
        }
        // AvgVariance accumulates the product of the diagonal variances
        Statistics->AvgVariance *= *CoVariance;
      }
    }
  }
  // turn the product of N diagonal variances into their geometric mean
  Statistics->AvgVariance =
      static_cast<float>(pow(static_cast<double>(Statistics->AvgVariance), 1.0 / N));

  return Statistics;
} // ComputeStatistics
2428 | | |
2429 | | /** |
2430 | | * This routine creates a spherical prototype data structure to |
2431 | | * approximate the samples in the specified cluster. |
2432 | | * Spherical prototypes have a single variance which is |
2433 | | * common across all dimensions. All dimensions are normally |
2434 | | * distributed and independent. |
2435 | | * @param N number of dimensions |
2436 | | * @param Cluster cluster to be made into a spherical prototype |
2437 | | * @param Statistics statistical info about samples in cluster |
2438 | | * @return Pointer to a new spherical prototype data structure |
2439 | | */ |
2440 | 0 | static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics) { |
2441 | 0 | PROTOTYPE *Proto; |
2442 | |
|
2443 | 0 | Proto = NewSimpleProto(N, Cluster); |
2444 | |
|
2445 | 0 | Proto->Variance.Spherical = Statistics->AvgVariance; |
2446 | 0 | if (Proto->Variance.Spherical < MINVARIANCE) { |
2447 | 0 | Proto->Variance.Spherical = MINVARIANCE; |
2448 | 0 | } |
2449 | |
|
2450 | 0 | Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical); |
2451 | 0 | Proto->TotalMagnitude = static_cast<float>( |
2452 | 0 | pow(static_cast<double>(Proto->Magnitude.Spherical), static_cast<double>(N))); |
2453 | 0 | Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical; |
2454 | 0 | Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude)); |
2455 | |
|
2456 | 0 | return (Proto); |
2457 | 0 | } // NewSphericalProto |
2458 | | |
2459 | | /** |
2460 | | * This routine creates an elliptical prototype data structure to |
2461 | | * approximate the samples in the specified cluster. |
2462 | | * Elliptical prototypes have a variance for each dimension. |
2463 | | * All dimensions are normally distributed and independent. |
2464 | | * @param N number of dimensions |
2465 | | * @param Cluster cluster to be made into an elliptical prototype |
2466 | | * @param Statistics statistical info about samples in cluster |
2467 | | * @return Pointer to a new elliptical prototype data structure |
2468 | | */ |
2469 | 0 | static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) { |
2470 | 0 | PROTOTYPE *Proto; |
2471 | 0 | int i; |
2472 | |
|
2473 | 0 | Proto = NewSimpleProto(N, Cluster); |
2474 | 0 | Proto->Variance.Elliptical = new float[N]; |
2475 | 0 | Proto->Magnitude.Elliptical = new float[N]; |
2476 | 0 | Proto->Weight.Elliptical = new float[N]; |
2477 | |
|
2478 | 0 | auto CoVariance = &Statistics->CoVariance[0]; |
2479 | 0 | Proto->TotalMagnitude = 1.0; |
2480 | 0 | for (i = 0; i < N; i++, CoVariance += N + 1) { |
2481 | 0 | Proto->Variance.Elliptical[i] = *CoVariance; |
2482 | 0 | if (Proto->Variance.Elliptical[i] < MINVARIANCE) { |
2483 | 0 | Proto->Variance.Elliptical[i] = MINVARIANCE; |
2484 | 0 | } |
2485 | |
|
2486 | 0 | Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]); |
2487 | 0 | Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i]; |
2488 | 0 | Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; |
2489 | 0 | } |
2490 | 0 | Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude)); |
2491 | 0 | Proto->Style = elliptical; |
2492 | 0 | return (Proto); |
2493 | 0 | } // NewEllipticalProto |
2494 | | |
2495 | | /** |
2496 | | * This routine creates a mixed prototype data structure to |
2497 | | * approximate the samples in the specified cluster. |
2498 | | * Mixed prototypes can have different distributions for |
2499 | | * each dimension. All dimensions are independent. The |
2500 | | * structure is initially filled in as though it were an |
2501 | | * elliptical prototype. The actual distributions of the |
2502 | | * dimensions can be altered by other routines. |
2503 | | * @param N number of dimensions |
2504 | | * @param Cluster cluster to be made into a mixed prototype |
2505 | | * @param Statistics statistical info about samples in cluster |
2506 | | * @return Pointer to a new mixed prototype data structure |
2507 | | */ |
2508 | 0 | static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) { |
2509 | 0 | auto Proto = NewEllipticalProto(N, Cluster, Statistics); |
2510 | 0 | Proto->Distrib.clear(); |
2511 | 0 | Proto->Distrib.resize(N, normal); |
2512 | 0 | Proto->Style = mixed; |
2513 | 0 | return Proto; |
2514 | 0 | } // NewMixedProto |
2515 | | |
2516 | | /** |
2517 | | * This routine allocates memory to hold a simple prototype |
2518 | | * data structure, i.e. one without independent distributions |
2519 | | * and variances for each dimension. |
2520 | | * @param N number of dimensions |
2521 | | * @param Cluster cluster to be made into a prototype |
2522 | | * @return Pointer to new simple prototype |
2523 | | */ |
2524 | 0 | static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster) { |
2525 | 0 | auto Proto = new PROTOTYPE; |
2526 | 0 | Proto->Mean = Cluster->Mean; |
2527 | 0 | Proto->Distrib.clear(); |
2528 | 0 | Proto->Significant = true; |
2529 | 0 | Proto->Merged = false; |
2530 | 0 | Proto->Style = spherical; |
2531 | 0 | Proto->NumSamples = Cluster->SampleCount; |
2532 | 0 | Proto->Cluster = Cluster; |
2533 | 0 | Proto->Cluster->Prototype = true; |
2534 | 0 | return Proto; |
2535 | 0 | } // NewSimpleProto |
2536 | | |
2537 | | /** |
2538 | | * This routine returns true if the specified covariance |
2539 | | * matrix indicates that all N dimensions are independent of |
2540 | | * one another. One dimension is judged to be independent of |
2541 | | * another when the magnitude of the corresponding correlation |
2542 | | * coefficient is |
2543 | | * less than the specified Independence factor. The |
2544 | | * correlation coefficient is calculated as: (see Duda and |
2545 | | * Hart, pg. 247) |
2546 | | * coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj]) |
2547 | | * The covariance matrix is assumed to be symmetric (which |
2548 | | * should always be true). |
2549 | | * @param ParamDesc descriptions of each feature space dimension |
2550 | | * @param N number of dimensions |
2551 | | * @param CoVariance ptr to a covariance matrix |
2552 | | * @param Independence max off-diagonal correlation coefficient |
2553 | | * @return true if dimensions are independent, false otherwise |
2554 | | */ |
2555 | 0 | static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence) { |
2556 | 0 | int i, j; |
2557 | 0 | float *VARii; // points to ith on-diagonal element |
2558 | 0 | float *VARjj; // points to jth on-diagonal element |
2559 | 0 | float CorrelationCoeff; |
2560 | |
|
2561 | 0 | VARii = CoVariance; |
2562 | 0 | for (i = 0; i < N; i++, VARii += N + 1) { |
2563 | 0 | if (ParamDesc[i].NonEssential) { |
2564 | 0 | continue; |
2565 | 0 | } |
2566 | | |
2567 | 0 | VARjj = VARii + N + 1; |
2568 | 0 | CoVariance = VARii + 1; |
2569 | 0 | for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) { |
2570 | 0 | if (ParamDesc[j].NonEssential) { |
2571 | 0 | continue; |
2572 | 0 | } |
2573 | | |
2574 | 0 | if ((*VARii == 0.0) || (*VARjj == 0.0)) { |
2575 | 0 | CorrelationCoeff = 0.0; |
2576 | 0 | } else { |
2577 | 0 | CorrelationCoeff = sqrt(std::sqrt(*CoVariance * *CoVariance / (*VARii * *VARjj))); |
2578 | 0 | } |
2579 | 0 | if (CorrelationCoeff > Independence) { |
2580 | 0 | return false; |
2581 | 0 | } |
2582 | 0 | } |
2583 | 0 | } |
2584 | 0 | return true; |
2585 | 0 | } // Independent |
2586 | | |
2587 | | /** |
2588 | | * This routine returns a histogram data structure which can |
2589 | | * be used by other routines to place samples into histogram |
2590 | | * buckets, and then apply a goodness of fit test to the |
2591 | | * histogram data to determine if the samples belong to the |
2592 | | * specified probability distribution. The routine keeps |
2593 | | * a list of bucket data structures which have already been |
2594 | | * created so that it minimizes the computation time needed |
2595 | | * to create a new bucket. |
2596 | | * @param clusterer which keeps a bucket_cache for us. |
2597 | | * @param Distribution type of probability distribution to test for |
2598 | | * @param SampleCount number of samples that are available |
2599 | | * @param Confidence probability of a Type I error |
2600 | | * @return Bucket data structure |
2601 | | */ |
2602 | | static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount, |
2603 | 0 | double Confidence) { |
2604 | | // Get an old bucket structure with the same number of buckets. |
2605 | 0 | uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount); |
2606 | 0 | BUCKETS *Buckets = clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS]; |
2607 | | |
2608 | | // If a matching bucket structure is not found, make one and save it. |
2609 | 0 | if (Buckets == nullptr) { |
2610 | 0 | Buckets = MakeBuckets(Distribution, SampleCount, Confidence); |
2611 | 0 | clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] = Buckets; |
2612 | 0 | } else { |
2613 | | // Just adjust the existing buckets. |
2614 | 0 | if (SampleCount != Buckets->SampleCount) { |
2615 | 0 | AdjustBuckets(Buckets, SampleCount); |
2616 | 0 | } |
2617 | 0 | if (Confidence != Buckets->Confidence) { |
2618 | 0 | Buckets->Confidence = Confidence; |
2619 | 0 | Buckets->ChiSquared = |
2620 | 0 | ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence); |
2621 | 0 | } |
2622 | 0 | InitBuckets(Buckets); |
2623 | 0 | } |
2624 | 0 | return Buckets; |
2625 | 0 | } // GetBuckets |
2626 | | |
/**
 * This routine creates a histogram data structure which can
 * be used by other routines to place samples into histogram
 * buckets, and then apply a goodness of fit test to the
 * histogram data to determine if the samples belong to the
 * specified probability distribution. The buckets are
 * allocated in such a way that the expected frequency of
 * samples in each bucket is approximately the same. In
 * order to make this possible, a mapping table is
 * computed which maps "normalized" samples into the
 * appropriate bucket.
 * @param Distribution type of probability distribution to test for
 * @param SampleCount number of samples that are available
 * @param Confidence probability of a Type I error
 * @return Pointer to new histogram data structure
 */
static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence) {
  // density functions indexed by DISTRIBUTION; D_random reuses the
  // uniform density table
  const DENSITYFUNC DensityFunction[] = {NormalDensity, UniformDensity, UniformDensity};
  int i, j;
  double BucketProbability;
  double NextBucketBoundary;
  double Probability;
  double ProbabilityDelta;
  double LastProbDensity;
  double ProbDensity;
  uint16_t CurrentBucket;
  bool Symmetrical;

  // allocate memory needed for data structure
  auto Buckets = new BUCKETS(OptimumNumberOfBuckets(SampleCount));
  Buckets->SampleCount = SampleCount;
  Buckets->Confidence = Confidence;

  // initialize simple fields
  Buckets->Distribution = Distribution;

  // all currently defined distributions are symmetrical
  Symmetrical = true;
  Buckets->ChiSquared =
      ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);

  if (Symmetrical) {
    // allocate buckets so that all have approx. equal probability
    BucketProbability = 1.0 / static_cast<double>(Buckets->NumberOfBuckets);

    // distribution is symmetric so fill in upper half then copy
    CurrentBucket = Buckets->NumberOfBuckets / 2;
    // with an odd bucket count the middle bucket straddles the center,
    // so only half of its probability mass lies in the upper half
    if (Odd(Buckets->NumberOfBuckets)) {
      NextBucketBoundary = BucketProbability / 2;
    } else {
      NextBucketBoundary = BucketProbability;
    }

    // integrate the density across the upper half of the mapping table,
    // advancing to the next bucket whenever the accumulated probability
    // crosses a bucket boundary
    Probability = 0.0;
    LastProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(BUCKETTABLESIZE / 2);
    for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
      ProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(i + 1);
      // trapezoidal approximation of the probability mass in [i, i+1]
      ProbabilityDelta = Integral(LastProbDensity, ProbDensity, 1.0);
      Probability += ProbabilityDelta;
      if (Probability > NextBucketBoundary) {
        if (CurrentBucket < Buckets->NumberOfBuckets - 1) {
          CurrentBucket++;
        }
        NextBucketBoundary += BucketProbability;
      }
      Buckets->Bucket[i] = CurrentBucket;
      Buckets->ExpectedCount[CurrentBucket] += static_cast<float>(ProbabilityDelta * SampleCount);
      LastProbDensity = ProbDensity;
    }
    // place any leftover probability into the last bucket
    // (the upper half of a symmetrical distribution carries mass 0.5)
    Buckets->ExpectedCount[CurrentBucket] += static_cast<float>((0.5 - Probability) * SampleCount);

    // copy upper half of distribution to lower half
    for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--) {
      Buckets->Bucket[i] = Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
    }

    // copy upper half of expected counts to lower half
    for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) {
      Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
    }
  }
  return Buckets;
} // MakeBuckets
2711 | | |
2712 | | /** |
2713 | | * This routine computes the optimum number of histogram |
2714 | | * buckets that should be used in a chi-squared goodness of |
2715 | | * fit test for the specified number of samples. The optimum |
2716 | | * number is computed based on Table 4.1 on pg. 147 of |
2717 | | * "Measurement and Analysis of Random Data" by Bendat & Piersol. |
2718 | | * Linear interpolation is used to interpolate between table |
2719 | | * values. The table is intended for a 0.05 level of |
2720 | | * significance (alpha). This routine assumes that it is |
2721 | | * equally valid for other alpha's, which may not be true. |
2722 | | * @param SampleCount number of samples to be tested |
2723 | | * @return Optimum number of histogram buckets |
2724 | | */ |
2725 | 0 | static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) { |
2726 | 0 | uint8_t Last, Next; |
2727 | 0 | float Slope; |
2728 | |
|
2729 | 0 | if (SampleCount < kCountTable[0]) { |
2730 | 0 | return kBucketsTable[0]; |
2731 | 0 | } |
2732 | | |
2733 | 0 | for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) { |
2734 | 0 | if (SampleCount <= kCountTable[Next]) { |
2735 | 0 | Slope = static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) / |
2736 | 0 | static_cast<float>(kCountTable[Next] - kCountTable[Last]); |
2737 | 0 | return ( |
2738 | 0 | static_cast<uint16_t>(kBucketsTable[Last] + Slope * (SampleCount - kCountTable[Last]))); |
2739 | 0 | } |
2740 | 0 | } |
2741 | 0 | return kBucketsTable[Last]; |
2742 | 0 | } // OptimumNumberOfBuckets |
2743 | | |
/**
 * This routine computes the chi-squared value which will
 * leave a cumulative probability of Alpha in the right tail
 * of a chi-squared distribution with the specified number of
 * degrees of freedom. Alpha must be between 0 and 1.
 * DegreesOfFreedom must be even. The routine maintains an
 * array of lists. Each list corresponds to a different
 * number of degrees of freedom. Each entry in the list
 * corresponds to a different alpha value and its corresponding
 * chi-squared value. Therefore, once a particular chi-squared
 * value is computed, it is stored in the list and never
 * needs to be computed again.
 * @param DegreesOfFreedom determines shape of distribution
 * @param Alpha probability of right tail
 * @return Desired chi-squared value
 */
static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha)
#define CHIACCURACY 0.01
#define MINALPHA (1e-200)
{
  // one cached list of (alpha, chi-squared) pairs per degrees-of-freedom
  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];

  // limit the minimum alpha that can be used - if alpha is too small
  // it may not be possible to compute chi-squared.
  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
  // round odd degrees of freedom up to the next even value, since only
  // even values are supported
  if (Odd(DegreesOfFreedom)) {
    DegreesOfFreedom++;
  }

  /* find the list of chi-squared values which have already been computed
   for the specified number of degrees of freedom. Search the list for
   the desired chi-squared. */
  CHISTRUCT SearchKey(0.0, Alpha);
  auto *found = search(ChiWith[DegreesOfFreedom], &SearchKey, AlphaMatch);
  auto OldChiSquared = reinterpret_cast<CHISTRUCT *>(found ? found->first_node() : nullptr);

  if (OldChiSquared == nullptr) {
    // not cached yet: solve ChiArea numerically for this alpha and
    // remember the result for future calls
    OldChiSquared = new CHISTRUCT(DegreesOfFreedom, Alpha);
    OldChiSquared->ChiSquared =
        Solve(ChiArea, OldChiSquared, static_cast<double>(DegreesOfFreedom), CHIACCURACY);
    ChiWith[DegreesOfFreedom] = push(ChiWith[DegreesOfFreedom], OldChiSquared);
  } else {
    // further optimization might move OldChiSquared to front of list
  }

  return (OldChiSquared->ChiSquared);

} // ComputeChiSquared
2792 | | |
2793 | | /** |
2794 | | * This routine computes the probability density function |
2795 | | * of a discrete normal distribution defined by the global |
2796 | | * variables kNormalMean, kNormalVariance, and kNormalMagnitude. |
2797 | | * Normal magnitude could, of course, be computed in terms of |
2798 | | * the normal variance but it is precomputed for efficiency. |
2799 | | * @param x number to compute the normal probability density for |
2800 | | * @note Globals: |
2801 | | * kNormalMean mean of a discrete normal distribution |
2802 | | * kNormalVariance variance of a discrete normal distribution |
2803 | | * kNormalMagnitude magnitude of a discrete normal distribution |
2804 | | * @return The value of the normal distribution at x. |
2805 | | */ |
2806 | 0 | static double NormalDensity(int32_t x) { |
2807 | 0 | double Distance; |
2808 | |
|
2809 | 0 | Distance = x - kNormalMean; |
2810 | 0 | return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance); |
2811 | 0 | } // NormalDensity |
2812 | | |
2813 | | /** |
2814 | | * This routine computes the probability density function |
2815 | | * of a uniform distribution at the specified point. The |
2816 | | * range of the distribution is from 0 to BUCKETTABLESIZE. |
2817 | | * @param x number to compute the uniform probability density for |
2818 | | * @return The value of the uniform distribution at x. |
2819 | | */ |
2820 | 0 | static double UniformDensity(int32_t x) { |
2821 | 0 | constexpr auto UniformDistributionDensity = 1.0 / BUCKETTABLESIZE; |
2822 | |
|
2823 | 0 | if ((x >= 0) && (x <= BUCKETTABLESIZE)) { |
2824 | 0 | return UniformDistributionDensity; |
2825 | 0 | } else { |
2826 | 0 | return 0.0; |
2827 | 0 | } |
2828 | 0 | } // UniformDensity |
2829 | | |
2830 | | /** |
2831 | | * This routine computes a trapezoidal approximation to the |
2832 | | * integral of a function over a small delta in x. |
2833 | | * @param f1 value of function at x1 |
2834 | | * @param f2 value of function at x2 |
2835 | | * @param Dx x2 - x1 (should always be positive) |
2836 | | * @return Approximation of the integral of the function from x1 to x2. |
2837 | | */ |
// Trapezoid-rule approximation of the integral of f over [x1, x2], given
// the endpoint values f1 = f(x1), f2 = f(x2) and the width Dx = x2 - x1.
static double Integral(double f1, double f2, double Dx) {
  return 0.5 * Dx * (f1 + f2);
} // Integral
2841 | | |
2842 | | /** |
2843 | | * This routine counts the number of cluster samples which |
2844 | | * fall within the various histogram buckets in Buckets. Only |
2845 | | * one dimension of each sample is examined. The exact meaning |
2846 | | * of the Mean and StdDev parameters depends on the |
2847 | | * distribution which is being analyzed (this info is in the |
2848 | | * Buckets data structure). For normal distributions, Mean |
2849 | | * and StdDev have the expected meanings. For uniform and |
2850 | | * random distributions the Mean is the center point of the |
2851 | | * range and the StdDev is 1/2 the range. A dimension with |
2852 | | * zero standard deviation cannot be statistically analyzed. |
2853 | | * In this case, a pseudo-analysis is used. |
2854 | | * The Buckets data structure is filled in. |
2855 | | * @param Buckets histogram buckets to count samples |
2856 | | * @param Cluster cluster whose samples are being analyzed |
2857 | | * @param Dim dimension of samples which is being analyzed |
2858 | | * @param ParamDesc description of the dimension |
2859 | | * @param Mean "mean" of the distribution |
2860 | | * @param StdDev "standard deviation" of the distribution |
2861 | | */ |
static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
                        float Mean, float StdDev) {
  uint16_t BucketID;
  int i;
  LIST SearchState;
  SAMPLE *Sample;

  // initialize the histogram bucket counts to 0
  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
    Buckets->Count[i] = 0;
  }

  if (StdDev == 0.0) {
    /* if the standard deviation is zero, then we can't statistically
       analyze the cluster.  Use a pseudo-analysis: samples exactly on
       the mean are distributed evenly across all buckets.  Samples greater
       than the mean are placed in the last bucket; samples less than the
       mean are placed in the first bucket. */

    InitSampleSearch(SearchState, Cluster);
    i = 0; // round-robin cursor for samples exactly equal to the mean
    while ((Sample = NextSample(&SearchState)) != nullptr) {
      if (Sample->Mean[Dim] > Mean) {
        BucketID = Buckets->NumberOfBuckets - 1;
      } else if (Sample->Mean[Dim] < Mean) {
        BucketID = 0;
      } else {
        BucketID = i;
      }
      Buckets->Count[BucketID] += 1;
      i++;
      if (i >= Buckets->NumberOfBuckets) {
        i = 0; // wrap the round-robin cursor
      }
    }
  } else {
    // search for all samples in the cluster and add to histogram buckets
    InitSampleSearch(SearchState, Cluster);
    while ((Sample = NextSample(&SearchState)) != nullptr) {
      switch (Buckets->Distribution) {
        case normal:
          BucketID = NormalBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);
          break;
        case D_random:
        case uniform:
          // random and uniform distributions share the same bucketing
          BucketID = UniformBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);
          break;
        default:
          BucketID = 0;
      }
      // NOTE(review): Bucket[] appears to remap a raw table index to a
      // (possibly merged) histogram bucket, whereas the StdDev == 0 branch
      // above indexes Count[] directly — confirm this asymmetry is intended.
      Buckets->Count[Buckets->Bucket[BucketID]] += 1;
    }
  }
} // FillBuckets
2916 | | |
2917 | | /** |
2918 | | * This routine determines which bucket x falls into in the |
2919 | | * discrete normal distribution defined by kNormalMean |
2920 | | * and kNormalStdDev. x values which exceed the range of |
2921 | | * the discrete distribution are clipped. |
2922 | | * @param ParamDesc used to identify circular dimensions |
2923 | | * @param x value to be normalized |
2924 | | * @param Mean mean of normal distribution |
2925 | | * @param StdDev standard deviation of normal distribution |
2926 | | * @return Bucket number into which x falls |
2927 | | */ |
2928 | 0 | static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) { |
2929 | 0 | float X; |
2930 | | |
2931 | | // wraparound circular parameters if necessary |
2932 | 0 | if (ParamDesc->Circular) { |
2933 | 0 | if (x - Mean > ParamDesc->HalfRange) { |
2934 | 0 | x -= ParamDesc->Range; |
2935 | 0 | } else if (x - Mean < -ParamDesc->HalfRange) { |
2936 | 0 | x += ParamDesc->Range; |
2937 | 0 | } |
2938 | 0 | } |
2939 | |
|
2940 | 0 | X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean; |
2941 | 0 | if (X < 0) { |
2942 | 0 | return 0; |
2943 | 0 | } |
2944 | 0 | if (X > BUCKETTABLESIZE - 1) { |
2945 | 0 | return (static_cast<uint16_t>(BUCKETTABLESIZE - 1)); |
2946 | 0 | } |
2947 | 0 | return static_cast<uint16_t>(floor(static_cast<double>(X))); |
2948 | 0 | } // NormalBucket |
2949 | | |
2950 | | /** |
2951 | | * This routine determines which bucket x falls into in the |
2952 | | * discrete uniform distribution defined by |
2953 | | * BUCKETTABLESIZE. x values which exceed the range of |
2954 | | * the discrete distribution are clipped. |
2955 | | * @param ParamDesc used to identify circular dimensions |
2956 | | * @param x value to be normalized |
2957 | | * @param Mean center of range of uniform distribution |
2958 | | * @param StdDev 1/2 the range of the uniform distribution |
2959 | | * @return Bucket number into which x falls |
2960 | | */ |
2961 | 0 | static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) { |
2962 | 0 | float X; |
2963 | | |
2964 | | // wraparound circular parameters if necessary |
2965 | 0 | if (ParamDesc->Circular) { |
2966 | 0 | if (x - Mean > ParamDesc->HalfRange) { |
2967 | 0 | x -= ParamDesc->Range; |
2968 | 0 | } else if (x - Mean < -ParamDesc->HalfRange) { |
2969 | 0 | x += ParamDesc->Range; |
2970 | 0 | } |
2971 | 0 | } |
2972 | |
|
2973 | 0 | X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0); |
2974 | 0 | if (X < 0) { |
2975 | 0 | return 0; |
2976 | 0 | } |
2977 | 0 | if (X > BUCKETTABLESIZE - 1) { |
2978 | 0 | return static_cast<uint16_t>(BUCKETTABLESIZE - 1); |
2979 | 0 | } |
2980 | 0 | return static_cast<uint16_t>(floor(static_cast<double>(X))); |
2981 | 0 | } // UniformBucket |
2982 | | |
2983 | | /** |
2984 | | * This routine performs a chi-square goodness of fit test |
2985 | | * on the histogram data in the Buckets data structure. |
2986 | | * true is returned if the histogram matches the probability |
2987 | | * distribution which was specified when the Buckets |
2988 | | * structure was originally created. Otherwise false is |
2989 | | * returned. |
2990 | | * @param Buckets histogram data to perform chi-square test on |
2991 | | * @return true if samples match distribution, false otherwise |
2992 | | */ |
2993 | 0 | static bool DistributionOK(BUCKETS *Buckets) { |
2994 | 0 | float FrequencyDifference; |
2995 | 0 | float TotalDifference; |
2996 | 0 | int i; |
2997 | | |
2998 | | // compute how well the histogram matches the expected histogram |
2999 | 0 | TotalDifference = 0.0; |
3000 | 0 | for (i = 0; i < Buckets->NumberOfBuckets; i++) { |
3001 | 0 | FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i]; |
3002 | 0 | TotalDifference += (FrequencyDifference * FrequencyDifference) / Buckets->ExpectedCount[i]; |
3003 | 0 | } |
3004 | | |
3005 | | // test to see if the difference is more than expected |
3006 | 0 | if (TotalDifference > Buckets->ChiSquared) { |
3007 | 0 | return false; |
3008 | 0 | } else { |
3009 | 0 | return true; |
3010 | 0 | } |
3011 | 0 | } // DistributionOK |
3012 | | |
3013 | | /** |
3014 | | * This routine computes the degrees of freedom that should |
3015 | | * be used in a chi-squared test with the specified number of |
3016 | | * histogram buckets. The result is always rounded up to |
3017 | | * the next even number so that the value of chi-squared can be |
3018 | | * computed more easily. This will cause the value of |
3019 | | * chi-squared to be higher than the optimum value, resulting |
3020 | | * in the chi-square test being more lenient than optimum. |
3021 | | * @param Distribution distribution being tested for |
3022 | | * @param HistogramBuckets number of buckets in chi-square test |
3023 | | * @return The number of degrees of freedom for a chi-square test |
3024 | | */ |
3025 | 0 | static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets) { |
3026 | 0 | static uint8_t DegreeOffsets[] = {3, 3, 1}; |
3027 | |
|
3028 | 0 | uint16_t AdjustedNumBuckets; |
3029 | |
|
3030 | 0 | AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[static_cast<int>(Distribution)]; |
3031 | 0 | if (Odd(AdjustedNumBuckets)) { |
3032 | 0 | AdjustedNumBuckets++; |
3033 | 0 | } |
3034 | 0 | return (AdjustedNumBuckets); |
3035 | |
|
3036 | 0 | } // DegreesOfFreedom |
3037 | | |
3038 | | /** |
3039 | | * This routine multiplies each ExpectedCount histogram entry |
3040 | | * by NewSampleCount/OldSampleCount so that the histogram |
3041 | | * is now adjusted to the new sample count. |
3042 | | * @param Buckets histogram data structure to adjust |
3043 | | * @param NewSampleCount new sample count to adjust to |
3044 | | */ |
3045 | 0 | static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount) { |
3046 | 0 | int i; |
3047 | 0 | double AdjustFactor; |
3048 | |
|
3049 | 0 | AdjustFactor = |
3050 | 0 | ((static_cast<double>(NewSampleCount)) / (static_cast<double>(Buckets->SampleCount))); |
3051 | |
|
3052 | 0 | for (i = 0; i < Buckets->NumberOfBuckets; i++) { |
3053 | 0 | Buckets->ExpectedCount[i] *= AdjustFactor; |
3054 | 0 | } |
3055 | |
|
3056 | 0 | Buckets->SampleCount = NewSampleCount; |
3057 | |
|
3058 | 0 | } // AdjustBuckets |
3059 | | |
3060 | | /** |
3061 | | * This routine sets the bucket counts in the specified histogram |
3062 | | * to zero. |
3063 | | * @param Buckets histogram data structure to init |
3064 | | */ |
3065 | 0 | static void InitBuckets(BUCKETS *Buckets) { |
3066 | 0 | int i; |
3067 | |
|
3068 | 0 | for (i = 0; i < Buckets->NumberOfBuckets; i++) { |
3069 | 0 | Buckets->Count[i] = 0; |
3070 | 0 | } |
3071 | |
|
3072 | 0 | } // InitBuckets |
3073 | | |
3074 | | /** |
3075 | | * This routine is used to search a list of structures which |
3076 | | * hold pre-computed chi-squared values for a chi-squared |
3077 | | * value whose corresponding alpha field matches the alpha |
3078 | | * field of SearchKey. |
3079 | | * |
3080 | | * It is called by the list search routines. |
3081 | | * |
3082 | | * @param arg1 chi-squared struct being tested for a match |
3083 | | * @param arg2 chi-squared struct that is the search key |
3084 | | * @return true if ChiStruct's Alpha matches SearchKey's Alpha |
3085 | | */ |
3086 | | static int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct, |
3087 | 0 | void *arg2) { // CHISTRUCT *SearchKey) |
3088 | 0 | auto *ChiStruct = static_cast<CHISTRUCT *>(arg1); |
3089 | 0 | auto *SearchKey = static_cast<CHISTRUCT *>(arg2); |
3090 | |
|
3091 | 0 | return (ChiStruct->Alpha == SearchKey->Alpha); |
3092 | |
|
3093 | 0 | } // AlphaMatch |
3094 | | |
3095 | | /** |
3096 | | * This routine attempts to find an x value at which Function |
3097 | | * goes to zero (i.e. a root of the function). It will only |
3098 | | * work correctly if a solution actually exists and there |
3099 | | * are no extrema between the solution and the InitialGuess. |
3100 | | * The algorithms used are extremely primitive. |
3101 | | * |
3102 | | * @param Function function whose zero is to be found |
3103 | | * @param FunctionParams arbitrary data to pass to function |
3104 | | * @param InitialGuess point to start solution search at |
3105 | | * @param Accuracy maximum allowed error |
3106 | | * @return Solution of function (x for which f(x) = 0). |
3107 | | */ |
static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy)
#define INITIALDELTA 0.1
#define DELTARATIO 0.1
{
  double x;      // current solution estimate
  double f;      // function value at the current estimate
  double Slope;  // finite-difference slope estimate at x
  double Delta;  // step size used for the finite-difference slope
  double NewDelta;
  double xDelta; // Newton step: f / Slope
  double LastPosX, LastNegX; // most recent x seen with f >= 0 / f < 0

  x = InitialGuess;
  Delta = INITIALDELTA;
  LastPosX = FLT_MAX;
  LastNegX = -FLT_MAX;
  f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);
  // Newton-style iteration: stop once the positive- and negative-side
  // estimates are within the requested accuracy of each other.
  while (Abs(LastPosX - LastNegX) > Accuracy) {
    // keep track of outer bounds of current estimate
    if (f < 0) {
      LastNegX = x;
    } else {
      LastPosX = x;
    }

    // compute the approx. slope of f(x) at the current point
    // NOTE(review): if f is locally flat, Slope can be zero and the
    // division below produces inf/NaN — relies on a well-behaved function
    // (here ChiArea), per the "extremely primitive" caveat above.
    Slope = ((*Function)(static_cast<CHISTRUCT *>(FunctionParams), x + Delta) - f) / Delta;

    // compute the next solution guess */
    xDelta = f / Slope;
    x -= xDelta;

    // reduce the delta used for computing slope to be a fraction of
    // the amount moved to get to the new guess
    NewDelta = Abs(xDelta) * DELTARATIO;
    if (NewDelta < Delta) {
      Delta = NewDelta;
    }

    // compute the value of the function at the new guess
    f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);
  }
  return (x);
} // Solve
3153 | | |
3154 | | /** |
3155 | | * This routine computes the area under a chi density curve |
3156 | | * from 0 to x, minus the desired area under the curve. The |
3157 | | * number of degrees of freedom of the chi curve is specified |
3158 | | * in the ChiParams structure. The desired area is also |
3159 | | * specified in the ChiParams structure as Alpha (or 1 minus |
3160 | | * the desired area). This routine is intended to be passed |
3161 | | * to the Solve() function to find the value of chi-squared |
3162 | | * which will yield a desired area under the right tail of |
3163 | | * the chi density curve. The function will only work for |
3164 | | * even degrees of freedom. The equations are based on |
3165 | | * integrating the chi density curve in parts to obtain |
3166 | | * a series that can be used to compute the area under the |
3167 | | * curve. |
3168 | | * @param ChiParams contains degrees of freedom and alpha |
3169 | | * @param x value of chi-squared to evaluate |
3170 | | * @return Error between actual and desired area under the chi curve. |
3171 | | */ |
3172 | 0 | static double ChiArea(CHISTRUCT *ChiParams, double x) { |
3173 | 0 | int i, N; |
3174 | 0 | double SeriesTotal; |
3175 | 0 | double Denominator; |
3176 | 0 | double PowerOfx; |
3177 | |
|
3178 | 0 | N = ChiParams->DegreesOfFreedom / 2 - 1; |
3179 | 0 | SeriesTotal = 1; |
3180 | 0 | Denominator = 1; |
3181 | 0 | PowerOfx = 1; |
3182 | 0 | for (i = 1; i <= N; i++) { |
3183 | 0 | Denominator *= 2 * i; |
3184 | 0 | PowerOfx *= x; |
3185 | 0 | SeriesTotal += PowerOfx / Denominator; |
3186 | 0 | } |
3187 | 0 | return ((SeriesTotal * exp(-0.5 * x)) - ChiParams->Alpha); |
3188 | |
|
3189 | 0 | } // ChiArea |
3190 | | |
3191 | | /** |
3192 | | * This routine looks at all samples in the specified cluster. |
3193 | | * It computes a running estimate of the percentage of the |
3194 | | * characters which have more than 1 sample in the cluster. |
3195 | | * When this percentage exceeds MaxIllegal, true is returned. |
3196 | | * Otherwise false is returned. The CharID |
3197 | | * fields must contain integers which identify the training |
3198 | | * characters which were used to generate the sample. One |
3199 | | * integer is used for each sample. The NumChar field in |
3200 | | * the Clusterer must contain the number of characters in the |
3201 | | * training set. All CharID fields must be between 0 and |
3202 | | * NumChar-1. The main function of this routine is to help |
3203 | | * identify clusters which need to be split further, i.e. if |
3204 | | * numerous training characters have 2 or more features which are |
3205 | | * contained in the same cluster, then the cluster should be |
3206 | | * split. |
3207 | | * |
3208 | | * @param Clusterer data structure holding cluster tree |
3209 | | * @param Cluster cluster containing samples to be tested |
3210 | | * @param MaxIllegal max percentage of samples allowed to have |
3211 | | * more than 1 feature in the cluster |
3212 | | * @return true if the cluster should be split, false otherwise. |
3213 | | */ |
static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal)
#define ILLEGAL_CHAR 2
{
  // Per-character flags: 0 = not yet seen in this cluster, 1 = seen once,
  // ILLEGAL_CHAR (2) = seen multiple times and already counted as illegal.
  // NOTE(review): function-local static (reused across calls to avoid
  // reallocation) — not thread-safe; confirm single-threaded use.
  static std::vector<uint8_t> CharFlags;
  LIST SearchState;
  SAMPLE *Sample;
  int32_t CharID;
  int32_t NumCharInCluster;    // running count of distinct characters
  int32_t NumIllegalInCluster; // characters with more than one sample here
  float PercentIllegal;

  // initial estimate assumes that no illegal chars exist in the cluster
  NumCharInCluster = Cluster->SampleCount;
  NumIllegalInCluster = 0;

  // grow (never shrink) the flag array to cover every training character
  if (Clusterer->NumChar > CharFlags.size()) {
    CharFlags.resize(Clusterer->NumChar);
  }

  for (auto &CharFlag : CharFlags) {
    CharFlag = false;
  }

  // find each sample in the cluster and check if we have seen it before
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample(&SearchState)) != nullptr) {
    CharID = Sample->CharID;
    if (CharFlags[CharID] == 0) {
      CharFlags[CharID] = true;
    } else {
      if (CharFlags[CharID] == 1) {
        // second sample for this character: count it as illegal once only
        NumIllegalInCluster++;
        CharFlags[CharID] = ILLEGAL_CHAR;
      }
      NumCharInCluster--;
      PercentIllegal = static_cast<float>(NumIllegalInCluster) / NumCharInCluster;
      if (PercentIllegal > MaxIllegal) {
        // too many multi-sample characters: free the search state before
        // the early exit and tell the caller to split this cluster
        destroy(SearchState);
        return true;
      }
    }
  }
  return false;
} // MultipleCharSamples
3259 | | |
3260 | | /** |
3261 | | * Compute the inverse of a matrix using LU decomposition with partial pivoting. |
3262 | | * The return value is the sum of norms of the off-diagonal terms of the |
3263 | | * product of a and inv. (A measure of the error.) |
3264 | | */ |
/**
 * Compute the inverse of a matrix using LU decomposition with partial
 * pivoting.  `input` and `inv` are size x size row-major arrays; element
 * (r, c) lives at index r * size + c.  The return value is the sum of the
 * absolute values of the off-diagonal terms of input * inv — a measure of
 * the inversion error (0 for a perfect inverse).
 */
static double InvertMatrix(const float *input, int size, float *inv) {
  const int n = size;
  // Flat row-major working buffers: upper-triangular factor, its inverse,
  // and the accumulated row operations (starts as the identity).
  std::vector<double> upper(n * n);
  std::vector<double> upper_inv(n * n, 0.0);
  std::vector<double> lower(n * n);

  for (int r = 0; r < n; ++r) {
    for (int c = 0; c < n; ++c) {
      upper[r * n + c] = input[r * n + c];
      lower[r * n + c] = (r == c) ? 1.0 : 0.0;
    }
  }

  // Forward elimination with partial pivoting: reduce `upper` to upper
  // triangular form, mirroring every row operation into `lower`.
  for (int c = 0; c < n; ++c) {
    // pick the row at or below c with the largest magnitude in column c
    int pivot_row = 0;
    double pivot_mag = -1.0;
    for (int r = c; r < n; ++r) {
      const double mag = std::fabs(upper[r * n + c]);
      if (mag > pivot_mag) {
        pivot_mag = mag;
        pivot_row = r;
      }
    }
    // swap the pivot row into position c
    if (pivot_row != c) {
      for (int k = 0; k < n; ++k) {
        double tmp = upper[pivot_row * n + k];
        upper[pivot_row * n + k] = upper[c * n + k];
        upper[c * n + k] = tmp;
        tmp = lower[pivot_row * n + k];
        lower[pivot_row * n + k] = lower[c * n + k];
        lower[c * n + k] = tmp;
      }
    }
    // eliminate column c from all rows below the pivot
    for (int r = c + 1; r < n; ++r) {
      const double ratio = -upper[r * n + c] / upper[c * n + c];
      for (int j = c; j < n; ++j) {
        upper[r * n + j] += upper[c * n + j] * ratio;
      }
      for (int k = 0; k < n; ++k) {
        lower[r * n + k] += lower[c * n + k] * ratio;
      }
    }
  }

  // Back substitution: invert the upper-triangular factor column by column.
  for (int c = 0; c < n; ++c) {
    upper_inv[c * n + c] = 1.0 / upper[c * n + c];
    for (int r = c - 1; r >= 0; --r) {
      double total = 0.0;
      for (int k = c; k > r; --k) {
        total += upper[r * n + k] * upper_inv[k * n + c];
      }
      upper_inv[r * n + c] = -total / upper[r * n + r];
    }
  }

  // The answer is upper_inv * lower; upper_inv is upper triangular so the
  // inner sum starts at k = r.
  for (int r = 0; r < n; ++r) {
    for (int c = 0; c < n; ++c) {
      double sum = 0.0;
      for (int k = r; k < n; ++k) {
        sum += upper_inv[r * n + k] * lower[k * n + c];
      }
      inv[r * n + c] = sum;
    }
  }

  // Error measure: how far input * inv is from the identity, summed over
  // the off-diagonal entries only.
  double error_sum = 0.0;
  for (int r = 0; r < n; ++r) {
    for (int c = 0; c < n; ++c) {
      double sum = 0.0;
      for (int k = 0; k < n; ++k) {
        sum += static_cast<double>(input[r * n + k]) * inv[k * n + c];
      }
      if (r != c) {
        error_sum += std::fabs(sum);
      }
    }
  }
  return error_sum;
}
3351 | | |
3352 | | } // namespace tesseract |