Coverage Report

Created: 2025-06-13 07:02

/src/tesseract/src/classify/cluster.cpp
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 ** Filename: cluster.cpp
3
 ** Purpose:  Routines for clustering points in N-D space
4
 ** Author:   Dan Johnson
5
 **
6
 ** (c) Copyright Hewlett-Packard Company, 1988.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *****************************************************************************/
17
18
#define _USE_MATH_DEFINES // for M_PI
19
20
#include "cluster.h"
21
22
#include "genericheap.h"
23
#include "kdpair.h"
24
#include "matrix.h"
25
#include "tprintf.h"
26
27
#include "helpers.h"
28
29
#include <cfloat> // for FLT_MAX
30
#include <cmath>  // for M_PI
31
#include <vector> // for std::vector
32
33
namespace tesseract {
34
35
0
#define HOTELLING 1  // If true use Hotelling's test to decide where to split.
36
0
#define FTABLE_X 10  // Size of FTable.
37
0
#define FTABLE_Y 100 // Size of FTable.
38
39
// Table of values approximating the cumulative F-distribution for a confidence
40
// of 1%.
41
const double FTable[FTABLE_Y][FTABLE_X] = {
42
    {
43
        4052.19,
44
        4999.52,
45
        5403.34,
46
        5624.62,
47
        5763.65,
48
        5858.97,
49
        5928.33,
50
        5981.10,
51
        6022.50,
52
        6055.85,
53
    },
54
    {
55
        98.502,
56
        99.000,
57
        99.166,
58
        99.249,
59
        99.300,
60
        99.333,
61
        99.356,
62
        99.374,
63
        99.388,
64
        99.399,
65
    },
66
    {
67
        34.116,
68
        30.816,
69
        29.457,
70
        28.710,
71
        28.237,
72
        27.911,
73
        27.672,
74
        27.489,
75
        27.345,
76
        27.229,
77
    },
78
    {
79
        21.198,
80
        18.000,
81
        16.694,
82
        15.977,
83
        15.522,
84
        15.207,
85
        14.976,
86
        14.799,
87
        14.659,
88
        14.546,
89
    },
90
    {
91
        16.258,
92
        13.274,
93
        12.060,
94
        11.392,
95
        10.967,
96
        10.672,
97
        10.456,
98
        10.289,
99
        10.158,
100
        10.051,
101
    },
102
    {
103
        13.745,
104
        10.925,
105
        9.780,
106
        9.148,
107
        8.746,
108
        8.466,
109
        8.260,
110
        8.102,
111
        7.976,
112
        7.874,
113
    },
114
    {
115
        12.246,
116
        9.547,
117
        8.451,
118
        7.847,
119
        7.460,
120
        7.191,
121
        6.993,
122
        6.840,
123
        6.719,
124
        6.620,
125
    },
126
    {
127
        11.259,
128
        8.649,
129
        7.591,
130
        7.006,
131
        6.632,
132
        6.371,
133
        6.178,
134
        6.029,
135
        5.911,
136
        5.814,
137
    },
138
    {
139
        10.561,
140
        8.022,
141
        6.992,
142
        6.422,
143
        6.057,
144
        5.802,
145
        5.613,
146
        5.467,
147
        5.351,
148
        5.257,
149
    },
150
    {
151
        10.044,
152
        7.559,
153
        6.552,
154
        5.994,
155
        5.636,
156
        5.386,
157
        5.200,
158
        5.057,
159
        4.942,
160
        4.849,
161
    },
162
    {
163
        9.646,
164
        7.206,
165
        6.217,
166
        5.668,
167
        5.316,
168
        5.069,
169
        4.886,
170
        4.744,
171
        4.632,
172
        4.539,
173
    },
174
    {
175
        9.330,
176
        6.927,
177
        5.953,
178
        5.412,
179
        5.064,
180
        4.821,
181
        4.640,
182
        4.499,
183
        4.388,
184
        4.296,
185
    },
186
    {
187
        9.074,
188
        6.701,
189
        5.739,
190
        5.205,
191
        4.862,
192
        4.620,
193
        4.441,
194
        4.302,
195
        4.191,
196
        4.100,
197
    },
198
    {
199
        8.862,
200
        6.515,
201
        5.564,
202
        5.035,
203
        4.695,
204
        4.456,
205
        4.278,
206
        4.140,
207
        4.030,
208
        3.939,
209
    },
210
    {
211
        8.683,
212
        6.359,
213
        5.417,
214
        4.893,
215
        4.556,
216
        4.318,
217
        4.142,
218
        4.004,
219
        3.895,
220
        3.805,
221
    },
222
    {
223
        8.531,
224
        6.226,
225
        5.292,
226
        4.773,
227
        4.437,
228
        4.202,
229
        4.026,
230
        3.890,
231
        3.780,
232
        3.691,
233
    },
234
    {
235
        8.400,
236
        6.112,
237
        5.185,
238
        4.669,
239
        4.336,
240
        4.102,
241
        3.927,
242
        3.791,
243
        3.682,
244
        3.593,
245
    },
246
    {
247
        8.285,
248
        6.013,
249
        5.092,
250
        4.579,
251
        4.248,
252
        4.015,
253
        3.841,
254
        3.705,
255
        3.597,
256
        3.508,
257
    },
258
    {
259
        8.185,
260
        5.926,
261
        5.010,
262
        4.500,
263
        4.171,
264
        3.939,
265
        3.765,
266
        3.631,
267
        3.523,
268
        3.434,
269
    },
270
    {
271
        8.096,
272
        5.849,
273
        4.938,
274
        4.431,
275
        4.103,
276
        3.871,
277
        3.699,
278
        3.564,
279
        3.457,
280
        3.368,
281
    },
282
    {
283
        8.017,
284
        5.780,
285
        4.874,
286
        4.369,
287
        4.042,
288
        3.812,
289
        3.640,
290
        3.506,
291
        3.398,
292
        3.310,
293
    },
294
    {
295
        7.945,
296
        5.719,
297
        4.817,
298
        4.313,
299
        3.988,
300
        3.758,
301
        3.587,
302
        3.453,
303
        3.346,
304
        3.258,
305
    },
306
    {
307
        7.881,
308
        5.664,
309
        4.765,
310
        4.264,
311
        3.939,
312
        3.710,
313
        3.539,
314
        3.406,
315
        3.299,
316
        3.211,
317
    },
318
    {
319
        7.823,
320
        5.614,
321
        4.718,
322
        4.218,
323
        3.895,
324
        3.667,
325
        3.496,
326
        3.363,
327
        3.256,
328
        3.168,
329
    },
330
    {
331
        7.770,
332
        5.568,
333
        4.675,
334
        4.177,
335
        3.855,
336
        3.627,
337
        3.457,
338
        3.324,
339
        3.217,
340
        3.129,
341
    },
342
    {
343
        7.721,
344
        5.526,
345
        4.637,
346
        4.140,
347
        3.818,
348
        3.591,
349
        3.421,
350
        3.288,
351
        3.182,
352
        3.094,
353
    },
354
    {
355
        7.677,
356
        5.488,
357
        4.601,
358
        4.106,
359
        3.785,
360
        3.558,
361
        3.388,
362
        3.256,
363
        3.149,
364
        3.062,
365
    },
366
    {
367
        7.636,
368
        5.453,
369
        4.568,
370
        4.074,
371
        3.754,
372
        3.528,
373
        3.358,
374
        3.226,
375
        3.120,
376
        3.032,
377
    },
378
    {
379
        7.598,
380
        5.420,
381
        4.538,
382
        4.045,
383
        3.725,
384
        3.499,
385
        3.330,
386
        3.198,
387
        3.092,
388
        3.005,
389
    },
390
    {
391
        7.562,
392
        5.390,
393
        4.510,
394
        4.018,
395
        3.699,
396
        3.473,
397
        3.305,
398
        3.173,
399
        3.067,
400
        2.979,
401
    },
402
    {
403
        7.530,
404
        5.362,
405
        4.484,
406
        3.993,
407
        3.675,
408
        3.449,
409
        3.281,
410
        3.149,
411
        3.043,
412
        2.955,
413
    },
414
    {
415
        7.499,
416
        5.336,
417
        4.459,
418
        3.969,
419
        3.652,
420
        3.427,
421
        3.258,
422
        3.127,
423
        3.021,
424
        2.934,
425
    },
426
    {
427
        7.471,
428
        5.312,
429
        4.437,
430
        3.948,
431
        3.630,
432
        3.406,
433
        3.238,
434
        3.106,
435
        3.000,
436
        2.913,
437
    },
438
    {
439
        7.444,
440
        5.289,
441
        4.416,
442
        3.927,
443
        3.611,
444
        3.386,
445
        3.218,
446
        3.087,
447
        2.981,
448
        2.894,
449
    },
450
    {
451
        7.419,
452
        5.268,
453
        4.396,
454
        3.908,
455
        3.592,
456
        3.368,
457
        3.200,
458
        3.069,
459
        2.963,
460
        2.876,
461
    },
462
    {
463
        7.396,
464
        5.248,
465
        4.377,
466
        3.890,
467
        3.574,
468
        3.351,
469
        3.183,
470
        3.052,
471
        2.946,
472
        2.859,
473
    },
474
    {
475
        7.373,
476
        5.229,
477
        4.360,
478
        3.873,
479
        3.558,
480
        3.334,
481
        3.167,
482
        3.036,
483
        2.930,
484
        2.843,
485
    },
486
    {
487
        7.353,
488
        5.211,
489
        4.343,
490
        3.858,
491
        3.542,
492
        3.319,
493
        3.152,
494
        3.021,
495
        2.915,
496
        2.828,
497
    },
498
    {
499
        7.333,
500
        5.194,
501
        4.327,
502
        3.843,
503
        3.528,
504
        3.305,
505
        3.137,
506
        3.006,
507
        2.901,
508
        2.814,
509
    },
510
    {
511
        7.314,
512
        5.179,
513
        4.313,
514
        3.828,
515
        3.514,
516
        3.291,
517
        3.124,
518
        2.993,
519
        2.888,
520
        2.801,
521
    },
522
    {
523
        7.296,
524
        5.163,
525
        4.299,
526
        3.815,
527
        3.501,
528
        3.278,
529
        3.111,
530
        2.980,
531
        2.875,
532
        2.788,
533
    },
534
    {
535
        7.280,
536
        5.149,
537
        4.285,
538
        3.802,
539
        3.488,
540
        3.266,
541
        3.099,
542
        2.968,
543
        2.863,
544
        2.776,
545
    },
546
    {
547
        7.264,
548
        5.136,
549
        4.273,
550
        3.790,
551
        3.476,
552
        3.254,
553
        3.087,
554
        2.957,
555
        2.851,
556
        2.764,
557
    },
558
    {
559
        7.248,
560
        5.123,
561
        4.261,
562
        3.778,
563
        3.465,
564
        3.243,
565
        3.076,
566
        2.946,
567
        2.840,
568
        2.754,
569
    },
570
    {
571
        7.234,
572
        5.110,
573
        4.249,
574
        3.767,
575
        3.454,
576
        3.232,
577
        3.066,
578
        2.935,
579
        2.830,
580
        2.743,
581
    },
582
    {
583
        7.220,
584
        5.099,
585
        4.238,
586
        3.757,
587
        3.444,
588
        3.222,
589
        3.056,
590
        2.925,
591
        2.820,
592
        2.733,
593
    },
594
    {
595
        7.207,
596
        5.087,
597
        4.228,
598
        3.747,
599
        3.434,
600
        3.213,
601
        3.046,
602
        2.916,
603
        2.811,
604
        2.724,
605
    },
606
    {
607
        7.194,
608
        5.077,
609
        4.218,
610
        3.737,
611
        3.425,
612
        3.204,
613
        3.037,
614
        2.907,
615
        2.802,
616
        2.715,
617
    },
618
    {
619
        7.182,
620
        5.066,
621
        4.208,
622
        3.728,
623
        3.416,
624
        3.195,
625
        3.028,
626
        2.898,
627
        2.793,
628
        2.706,
629
    },
630
    {
631
        7.171,
632
        5.057,
633
        4.199,
634
        3.720,
635
        3.408,
636
        3.186,
637
        3.020,
638
        2.890,
639
        2.785,
640
        2.698,
641
    },
642
    {
643
        7.159,
644
        5.047,
645
        4.191,
646
        3.711,
647
        3.400,
648
        3.178,
649
        3.012,
650
        2.882,
651
        2.777,
652
        2.690,
653
    },
654
    {
655
        7.149,
656
        5.038,
657
        4.182,
658
        3.703,
659
        3.392,
660
        3.171,
661
        3.005,
662
        2.874,
663
        2.769,
664
        2.683,
665
    },
666
    {
667
        7.139,
668
        5.030,
669
        4.174,
670
        3.695,
671
        3.384,
672
        3.163,
673
        2.997,
674
        2.867,
675
        2.762,
676
        2.675,
677
    },
678
    {
679
        7.129,
680
        5.021,
681
        4.167,
682
        3.688,
683
        3.377,
684
        3.156,
685
        2.990,
686
        2.860,
687
        2.755,
688
        2.668,
689
    },
690
    {
691
        7.119,
692
        5.013,
693
        4.159,
694
        3.681,
695
        3.370,
696
        3.149,
697
        2.983,
698
        2.853,
699
        2.748,
700
        2.662,
701
    },
702
    {
703
        7.110,
704
        5.006,
705
        4.152,
706
        3.674,
707
        3.363,
708
        3.143,
709
        2.977,
710
        2.847,
711
        2.742,
712
        2.655,
713
    },
714
    {
715
        7.102,
716
        4.998,
717
        4.145,
718
        3.667,
719
        3.357,
720
        3.136,
721
        2.971,
722
        2.841,
723
        2.736,
724
        2.649,
725
    },
726
    {
727
        7.093,
728
        4.991,
729
        4.138,
730
        3.661,
731
        3.351,
732
        3.130,
733
        2.965,
734
        2.835,
735
        2.730,
736
        2.643,
737
    },
738
    {
739
        7.085,
740
        4.984,
741
        4.132,
742
        3.655,
743
        3.345,
744
        3.124,
745
        2.959,
746
        2.829,
747
        2.724,
748
        2.637,
749
    },
750
    {
751
        7.077,
752
        4.977,
753
        4.126,
754
        3.649,
755
        3.339,
756
        3.119,
757
        2.953,
758
        2.823,
759
        2.718,
760
        2.632,
761
    },
762
    {
763
        7.070,
764
        4.971,
765
        4.120,
766
        3.643,
767
        3.333,
768
        3.113,
769
        2.948,
770
        2.818,
771
        2.713,
772
        2.626,
773
    },
774
    {
775
        7.062,
776
        4.965,
777
        4.114,
778
        3.638,
779
        3.328,
780
        3.108,
781
        2.942,
782
        2.813,
783
        2.708,
784
        2.621,
785
    },
786
    {
787
        7.055,
788
        4.959,
789
        4.109,
790
        3.632,
791
        3.323,
792
        3.103,
793
        2.937,
794
        2.808,
795
        2.703,
796
        2.616,
797
    },
798
    {
799
        7.048,
800
        4.953,
801
        4.103,
802
        3.627,
803
        3.318,
804
        3.098,
805
        2.932,
806
        2.803,
807
        2.698,
808
        2.611,
809
    },
810
    {
811
        7.042,
812
        4.947,
813
        4.098,
814
        3.622,
815
        3.313,
816
        3.093,
817
        2.928,
818
        2.798,
819
        2.693,
820
        2.607,
821
    },
822
    {
823
        7.035,
824
        4.942,
825
        4.093,
826
        3.618,
827
        3.308,
828
        3.088,
829
        2.923,
830
        2.793,
831
        2.689,
832
        2.602,
833
    },
834
    {
835
        7.029,
836
        4.937,
837
        4.088,
838
        3.613,
839
        3.304,
840
        3.084,
841
        2.919,
842
        2.789,
843
        2.684,
844
        2.598,
845
    },
846
    {
847
        7.023,
848
        4.932,
849
        4.083,
850
        3.608,
851
        3.299,
852
        3.080,
853
        2.914,
854
        2.785,
855
        2.680,
856
        2.593,
857
    },
858
    {
859
        7.017,
860
        4.927,
861
        4.079,
862
        3.604,
863
        3.295,
864
        3.075,
865
        2.910,
866
        2.781,
867
        2.676,
868
        2.589,
869
    },
870
    {
871
        7.011,
872
        4.922,
873
        4.074,
874
        3.600,
875
        3.291,
876
        3.071,
877
        2.906,
878
        2.777,
879
        2.672,
880
        2.585,
881
    },
882
    {
883
        7.006,
884
        4.917,
885
        4.070,
886
        3.596,
887
        3.287,
888
        3.067,
889
        2.902,
890
        2.773,
891
        2.668,
892
        2.581,
893
    },
894
    {
895
        7.001,
896
        4.913,
897
        4.066,
898
        3.591,
899
        3.283,
900
        3.063,
901
        2.898,
902
        2.769,
903
        2.664,
904
        2.578,
905
    },
906
    {
907
        6.995,
908
        4.908,
909
        4.062,
910
        3.588,
911
        3.279,
912
        3.060,
913
        2.895,
914
        2.765,
915
        2.660,
916
        2.574,
917
    },
918
    {
919
        6.990,
920
        4.904,
921
        4.058,
922
        3.584,
923
        3.275,
924
        3.056,
925
        2.891,
926
        2.762,
927
        2.657,
928
        2.570,
929
    },
930
    {
931
        6.985,
932
        4.900,
933
        4.054,
934
        3.580,
935
        3.272,
936
        3.052,
937
        2.887,
938
        2.758,
939
        2.653,
940
        2.567,
941
    },
942
    {
943
        6.981,
944
        4.896,
945
        4.050,
946
        3.577,
947
        3.268,
948
        3.049,
949
        2.884,
950
        2.755,
951
        2.650,
952
        2.563,
953
    },
954
    {
955
        6.976,
956
        4.892,
957
        4.047,
958
        3.573,
959
        3.265,
960
        3.046,
961
        2.881,
962
        2.751,
963
        2.647,
964
        2.560,
965
    },
966
    {
967
        6.971,
968
        4.888,
969
        4.043,
970
        3.570,
971
        3.261,
972
        3.042,
973
        2.877,
974
        2.748,
975
        2.644,
976
        2.557,
977
    },
978
    {
979
        6.967,
980
        4.884,
981
        4.040,
982
        3.566,
983
        3.258,
984
        3.039,
985
        2.874,
986
        2.745,
987
        2.640,
988
        2.554,
989
    },
990
    {
991
        6.963,
992
        4.881,
993
        4.036,
994
        3.563,
995
        3.255,
996
        3.036,
997
        2.871,
998
        2.742,
999
        2.637,
1000
        2.551,
1001
    },
1002
    {
1003
        6.958,
1004
        4.877,
1005
        4.033,
1006
        3.560,
1007
        3.252,
1008
        3.033,
1009
        2.868,
1010
        2.739,
1011
        2.634,
1012
        2.548,
1013
    },
1014
    {
1015
        6.954,
1016
        4.874,
1017
        4.030,
1018
        3.557,
1019
        3.249,
1020
        3.030,
1021
        2.865,
1022
        2.736,
1023
        2.632,
1024
        2.545,
1025
    },
1026
    {
1027
        6.950,
1028
        4.870,
1029
        4.027,
1030
        3.554,
1031
        3.246,
1032
        3.027,
1033
        2.863,
1034
        2.733,
1035
        2.629,
1036
        2.542,
1037
    },
1038
    {
1039
        6.947,
1040
        4.867,
1041
        4.024,
1042
        3.551,
1043
        3.243,
1044
        3.025,
1045
        2.860,
1046
        2.731,
1047
        2.626,
1048
        2.539,
1049
    },
1050
    {
1051
        6.943,
1052
        4.864,
1053
        4.021,
1054
        3.548,
1055
        3.240,
1056
        3.022,
1057
        2.857,
1058
        2.728,
1059
        2.623,
1060
        2.537,
1061
    },
1062
    {
1063
        6.939,
1064
        4.861,
1065
        4.018,
1066
        3.545,
1067
        3.238,
1068
        3.019,
1069
        2.854,
1070
        2.725,
1071
        2.621,
1072
        2.534,
1073
    },
1074
    {
1075
        6.935,
1076
        4.858,
1077
        4.015,
1078
        3.543,
1079
        3.235,
1080
        3.017,
1081
        2.852,
1082
        2.723,
1083
        2.618,
1084
        2.532,
1085
    },
1086
    {
1087
        6.932,
1088
        4.855,
1089
        4.012,
1090
        3.540,
1091
        3.233,
1092
        3.014,
1093
        2.849,
1094
        2.720,
1095
        2.616,
1096
        2.529,
1097
    },
1098
    {
1099
        6.928,
1100
        4.852,
1101
        4.010,
1102
        3.538,
1103
        3.230,
1104
        3.012,
1105
        2.847,
1106
        2.718,
1107
        2.613,
1108
        2.527,
1109
    },
1110
    {
1111
        6.925,
1112
        4.849,
1113
        4.007,
1114
        3.535,
1115
        3.228,
1116
        3.009,
1117
        2.845,
1118
        2.715,
1119
        2.611,
1120
        2.524,
1121
    },
1122
    {
1123
        6.922,
1124
        4.846,
1125
        4.004,
1126
        3.533,
1127
        3.225,
1128
        3.007,
1129
        2.842,
1130
        2.713,
1131
        2.609,
1132
        2.522,
1133
    },
1134
    {
1135
        6.919,
1136
        4.844,
1137
        4.002,
1138
        3.530,
1139
        3.223,
1140
        3.004,
1141
        2.840,
1142
        2.711,
1143
        2.606,
1144
        2.520,
1145
    },
1146
    {
1147
        6.915,
1148
        4.841,
1149
        3.999,
1150
        3.528,
1151
        3.221,
1152
        3.002,
1153
        2.838,
1154
        2.709,
1155
        2.604,
1156
        2.518,
1157
    },
1158
    {
1159
        6.912,
1160
        4.838,
1161
        3.997,
1162
        3.525,
1163
        3.218,
1164
        3.000,
1165
        2.835,
1166
        2.706,
1167
        2.602,
1168
        2.515,
1169
    },
1170
    {
1171
        6.909,
1172
        4.836,
1173
        3.995,
1174
        3.523,
1175
        3.216,
1176
        2.998,
1177
        2.833,
1178
        2.704,
1179
        2.600,
1180
        2.513,
1181
    },
1182
    {
1183
        6.906,
1184
        4.833,
1185
        3.992,
1186
        3.521,
1187
        3.214,
1188
        2.996,
1189
        2.831,
1190
        2.702,
1191
        2.598,
1192
        2.511,
1193
    },
1194
    {
1195
        6.904,
1196
        4.831,
1197
        3.990,
1198
        3.519,
1199
        3.212,
1200
        2.994,
1201
        2.829,
1202
        2.700,
1203
        2.596,
1204
        2.509,
1205
    },
1206
    {
1207
        6.901,
1208
        4.829,
1209
        3.988,
1210
        3.517,
1211
        3.210,
1212
        2.992,
1213
        2.827,
1214
        2.698,
1215
        2.594,
1216
        2.507,
1217
    },
1218
    {
1219
        6.898,
1220
        4.826,
1221
        3.986,
1222
        3.515,
1223
        3.208,
1224
        2.990,
1225
        2.825,
1226
        2.696,
1227
        2.592,
1228
        2.505,
1229
    },
1230
    {6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}};
1231
1232
/** define the variance which will be used as a minimum variance for any
1233
  dimension of any feature. Since most features are calculated from numbers
1234
  with a precision no better than 1 in 128, the variance should never be
1235
  less than the square of this number for parameters whose range is 1. */
1236
0
#define MINVARIANCE 0.0004
1237
1238
/** define the absolute minimum number of samples which must be present in
1239
  order to accurately test hypotheses about underlying probability
1240
  distributions.  Define separately the minimum samples that are needed
1241
  before a statistical analysis is attempted; this number should be
1242
  equal to MINSAMPLES but can be set to a lower number for early testing
1243
  when very few samples are available. */
1244
#define MINSAMPLESPERBUCKET 5
1245
#define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
1246
0
#define MINSAMPLESNEEDED 1
1247
1248
/** define the size of the table which maps normalized samples to
1249
  histogram buckets.  Also define the number of standard deviations
1250
  in a normal distribution which are considered to be significant.
1251
  The mapping table will be defined in such a way that it covers
1252
  the specified number of standard deviations on either side of
1253
  the mean.  BUCKETTABLESIZE should always be even. */
1254
0
#define BUCKETTABLESIZE 1024
1255
#define NORMALEXTENT 3.0
1256
1257
struct TEMPCLUSTER {
1258
  CLUSTER *Cluster;
1259
  CLUSTER *Neighbor;
1260
};
1261
1262
using ClusterPair = tesseract::KDPairInc<float, TEMPCLUSTER *>;
1263
using ClusterHeap = tesseract::GenericHeap<ClusterPair>;
1264
1265
struct STATISTICS {
1266
0
  STATISTICS(size_t n) : CoVariance(n * n), Min(n), Max(n) {
1267
0
  }
1268
  float AvgVariance = 1.0f;
1269
  std::vector<float> CoVariance;
1270
  std::vector<float> Min; // largest negative distance from the mean
1271
  std::vector<float> Max; // largest positive distance from the mean
1272
};
1273
1274
struct BUCKETS {
1275
0
  BUCKETS(size_t n) : NumberOfBuckets(n), Count(n), ExpectedCount(n) {
1276
0
  }
1277
0
  ~BUCKETS() {
1278
0
  }
1279
  DISTRIBUTION Distribution = normal; // distribution being tested for
1280
  uint32_t SampleCount = 0;         // # of samples in histogram
1281
  double Confidence = 0.0;          // confidence level of test
1282
  double ChiSquared = 0.0;          // test threshold
1283
  uint16_t NumberOfBuckets;         // number of cells in histogram
1284
  uint16_t Bucket[BUCKETTABLESIZE]; // mapping to histogram buckets
1285
  std::vector<uint32_t> Count;      // frequency of occurrence histogram
1286
  std::vector<float> ExpectedCount; // expected histogram
1287
};
1288
1289
struct CHISTRUCT {
1290
  /// This constructor allocates a new data structure which is used
1291
  /// to hold a chi-squared value along with its associated
1292
  /// number of degrees of freedom and alpha value.
1293
  ///
1294
  /// @param degreesOfFreedom  degrees of freedom for new chi value
1295
  /// @param alpha     confidence level for new chi value
1296
0
  CHISTRUCT(uint16_t degreesOfFreedom, double alpha) : DegreesOfFreedom(degreesOfFreedom), Alpha(alpha) {
1297
0
  }
1298
  uint16_t DegreesOfFreedom = 0;
1299
  double Alpha = 0.0;
1300
  double ChiSquared = 0.0;
1301
};
1302
1303
// For use with KDWalk / MakePotentialClusters
1304
struct ClusteringContext {
1305
  ClusterHeap *heap;       // heap used to hold temp clusters, "best" on top
1306
  TEMPCLUSTER *candidates; // array of potential clusters
1307
  KDTREE *tree;            // kd-tree to be searched for neighbors
1308
  int32_t next;            // next candidate to be used
1309
};
1310
1311
using DENSITYFUNC = double (*)(int32_t);
1312
using SOLVEFUNC = double (*)(CHISTRUCT *, double);
1313
1314
0
#define Odd(N) ((N) % 2)
1315
0
#define Mirror(N, R) ((R) - (N)-1)
1316
0
#define Abs(N) (((N) < 0) ? (-(N)) : (N))
1317
1318
//--------------Global Data Definitions and Declarations----------------------
1319
/** the following variables describe a discrete normal distribution
1320
  which is used by NormalDensity() and NormalBucket().  The
1321
  constant NORMALEXTENT determines how many standard
1322
  deviations of the distribution are mapped onto the fixed
1323
  discrete range of x.  x=0 is mapped to -NORMALEXTENT standard
1324
  deviations and x=BUCKETTABLESIZE is mapped to
1325
  +NORMALEXTENT standard deviations. */
1326
#define SqrtOf2Pi 2.506628275
1327
static const double kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
1328
static const double kNormalVariance =
1329
    (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
1330
static const double kNormalMagnitude = (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
1331
static const double kNormalMean = BUCKETTABLESIZE / 2;
1332
1333
/** define lookup tables used to compute the number of histogram buckets
1334
  that should be used for a given number of samples. */
1335
0
#define LOOKUPTABLESIZE 8
1336
#define MAXDEGREESOFFREEDOM MAXBUCKETS
1337
1338
static const uint32_t kCountTable[LOOKUPTABLESIZE] = {MINSAMPLES, 200,  400, 600, 800,
1339
                                                      1000,       1500, 2000}; // number of samples
1340
1341
static const uint16_t kBucketsTable[LOOKUPTABLESIZE] = {
1342
    MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS}; // number of buckets
1343
1344
/*-------------------------------------------------------------------------
1345
          Private Function Prototypes
1346
--------------------------------------------------------------------------*/
1347
static void CreateClusterTree(CLUSTERER *Clusterer);
1348
1349
static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t Level);
1350
1351
static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance);
1352
1353
static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
1354
1355
static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
1356
1357
static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster);
1358
1359
static PROTOTYPE *MakeDegenerateProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics,
1360
                                      PROTOSTYLE Style, int32_t MinSamples);
1361
1362
static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,
1363
                                      STATISTICS *Statistics);
1364
1365
static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
1366
                                     BUCKETS *Buckets);
1367
1368
static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
1369
                                      STATISTICS *Statistics, BUCKETS *Buckets);
1370
1371
static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
1372
                                 BUCKETS *NormalBuckets, double Confidence);
1373
1374
// Forward declarations of the file-local helper routines used by the
// clustering implementation below.

// --- Prototype construction and statistics -------------------------------
static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);

static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics);

static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster);

static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster);

static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence);

// --- Histogram buckets used for goodness-of-fit testing ------------------
static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount,
                           double Confidence);

static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence);

static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount);

static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha);

static double NormalDensity(int32_t x);

static double UniformDensity(int32_t x);

static double Integral(double f1, double f2, double Dx);

static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
                        float Mean, float StdDev);

static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev);

static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev);

static bool DistributionOK(BUCKETS *Buckets);

static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets);

static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount);

static void InitBuckets(BUCKETS *Buckets);

// --- Numerical utilities --------------------------------------------------
static int AlphaMatch(void *arg1,  // CHISTRUCT *ChiStruct,
                      void *arg2); // CHISTRUCT *SearchKey);

static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy);

static double ChiArea(CHISTRUCT *ChiParams, double x);

static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal);

static double InvertMatrix(const float *input, int size, float *inv);
//--------------------------Public Code--------------------------------------
1432
/**
1433
 * This routine creates a new clusterer data structure,
1434
 * initializes it, and returns a pointer to it.
1435
 *
1436
 * @param SampleSize  number of dimensions in feature space
1437
 * @param ParamDesc description of each dimension
1438
 * @return pointer to the new clusterer data structure
1439
 */
1440
0
CLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]) {
1441
0
  int i;
1442
1443
  // allocate main clusterer data structure and init simple fields
1444
0
  auto Clusterer = new CLUSTERER;
1445
0
  Clusterer->SampleSize = SampleSize;
1446
0
  Clusterer->NumberOfSamples = 0;
1447
0
  Clusterer->NumChar = 0;
1448
1449
  // init fields which will not be used initially
1450
0
  Clusterer->Root = nullptr;
1451
0
  Clusterer->ProtoList = NIL_LIST;
1452
1453
  // maintain a copy of param descriptors in the clusterer data structure
1454
0
  Clusterer->ParamDesc = new PARAM_DESC[SampleSize];
1455
0
  for (i = 0; i < SampleSize; i++) {
1456
0
    Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
1457
0
    Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
1458
0
    Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
1459
0
    Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
1460
0
    Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
1461
0
    Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
1462
0
    Clusterer->ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
1463
0
  }
1464
1465
  // allocate a kd tree to hold the samples
1466
0
  Clusterer->KDTree = MakeKDTree(SampleSize, ParamDesc);
1467
1468
  // Initialize cache of histogram buckets to minimize recomputing them.
1469
0
  for (auto &d : Clusterer->bucket_cache) {
1470
0
    for (auto &c : d) {
1471
0
      c = nullptr;
1472
0
    }
1473
0
  }
1474
1475
0
  return Clusterer;
1476
0
} // MakeClusterer
1477
1478
/**
1479
 * This routine creates a new sample data structure to hold
1480
 * the specified feature.  This sample is added to the clusterer
1481
 * data structure (so that it knows which samples are to be
1482
 * clustered later), and a pointer to the sample is returned to
1483
 * the caller.
1484
 *
1485
 * @param Clusterer clusterer data structure to add sample to
1486
 * @param Feature feature to be added to clusterer
1487
 * @param CharID  unique ident. of char that sample came from
1488
 *
1489
 * @return    Pointer to the new sample data structure
1490
 */
1491
0
SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID) {
1492
0
  int i;
1493
1494
  // see if the samples have already been clustered - if so trap an error
1495
  // Can't add samples after they have been clustered.
1496
0
  ASSERT_HOST(Clusterer->Root == nullptr);
1497
1498
  // allocate the new sample and initialize it
1499
0
  auto Sample = new SAMPLE(Clusterer->SampleSize);
1500
0
  Sample->Clustered = false;
1501
0
  Sample->Prototype = false;
1502
0
  Sample->SampleCount = 1;
1503
0
  Sample->Left = nullptr;
1504
0
  Sample->Right = nullptr;
1505
0
  Sample->CharID = CharID;
1506
1507
0
  for (i = 0; i < Clusterer->SampleSize; i++) {
1508
0
    Sample->Mean[i] = Feature[i];
1509
0
  }
1510
1511
  // add the sample to the KD tree - keep track of the total # of samples
1512
0
  Clusterer->NumberOfSamples++;
1513
0
  KDStore(Clusterer->KDTree, &Sample->Mean[0], Sample);
1514
0
  if (CharID >= Clusterer->NumChar) {
1515
0
    Clusterer->NumChar = CharID + 1;
1516
0
  }
1517
1518
  // execute hook for monitoring clustering operation
1519
  // (*SampleCreationHook)(Sample);
1520
1521
0
  return (Sample);
1522
0
} // MakeSample
1523
1524
/**
 * This routine first checks to see if the samples in this
 * clusterer have already been clustered before; if so, it does
 * not bother to recreate the cluster tree.  It simply recomputes
 * the prototypes based on the new Config info.
 *
 * If the samples have not been clustered before, the
 * samples in the KD tree are formed into a cluster tree and then
 * the prototypes are computed from the cluster tree.
 *
 * In either case this routine returns a pointer to a
 * list of prototypes that best represent the samples given
 * the constraints specified in Config.
 *
 * @param Clusterer data struct containing samples to be clustered
 * @param Config  parameters which control clustering process
 *
 * @return Pointer to a list of prototypes (owned by the Clusterer)
 */
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
  // only create cluster tree if samples have never been clustered before
  if (Clusterer->Root == nullptr) {
    CreateClusterTree(Clusterer);
  }

  // deallocate the old prototype list if one exists
  FreeProtoList(&Clusterer->ProtoList);
  Clusterer->ProtoList = NIL_LIST;

  // compute prototypes starting at the root node in the tree
  ComputePrototypes(Clusterer, Config);
  // We don't need the cluster pointers in the protos any more, so null them
  // out, which makes it safe to delete the clusterer.
  LIST proto_list = Clusterer->ProtoList;
  iterate(proto_list) {
    auto *proto = reinterpret_cast<PROTOTYPE *>(proto_list->first_node());
    proto->Cluster = nullptr;
  }
  return Clusterer->ProtoList;
} // ClusterSamples
1564
1565
/**
1566
 * This routine frees all of the memory allocated to the
1567
 * specified data structure.  It will not, however, free
1568
 * the memory used by the prototype list.  The pointers to
1569
 * the clusters for each prototype in the list will be set
1570
 * to nullptr to indicate that the cluster data structures no
1571
 * longer exist.  Any sample lists that have been obtained
1572
 * via calls to GetSamples are no longer valid.
1573
 * @param Clusterer pointer to data structure to be freed
1574
 */
1575
0
void FreeClusterer(CLUSTERER *Clusterer) {
1576
0
  if (Clusterer != nullptr) {
1577
0
    delete[] Clusterer->ParamDesc;
1578
0
    delete Clusterer->KDTree;
1579
0
    delete Clusterer->Root;
1580
    // Free up all used buckets structures.
1581
0
    for (auto &d : Clusterer->bucket_cache) {
1582
0
      for (auto &c : d) {
1583
0
        delete c;
1584
0
      }
1585
0
    }
1586
1587
0
    delete Clusterer;
1588
0
  }
1589
0
} // FreeClusterer
1590
1591
/**
 * This routine frees all of the memory allocated to the
 * specified list of prototypes.  The clusters which are
 * pointed to by the prototypes are not freed.
 * @param ProtoList pointer to list of prototypes to be freed
 */
void FreeProtoList(LIST *ProtoList) {
  // destroy_nodes applies FreePrototype to each node's payload and then
  // releases the list nodes themselves.
  destroy_nodes(*ProtoList, FreePrototype);
} // FreeProtoList
1600
1601
/**
1602
 * This routine deallocates the memory consumed by the specified
1603
 * prototype and modifies the corresponding cluster so that it
1604
 * is no longer marked as a prototype.  The cluster is NOT
1605
 * deallocated by this routine.
1606
 * @param arg prototype data structure to be deallocated
1607
 */
1608
0
void FreePrototype(void *arg) { // PROTOTYPE     *Prototype)
1609
0
  auto *Prototype = static_cast<PROTOTYPE *>(arg);
1610
1611
  // unmark the corresponding cluster (if there is one
1612
0
  if (Prototype->Cluster != nullptr) {
1613
0
    Prototype->Cluster->Prototype = false;
1614
0
  }
1615
1616
  // deallocate the prototype statistics and then the prototype itself
1617
0
  if (Prototype->Style != spherical) {
1618
0
    delete[] Prototype->Variance.Elliptical;
1619
0
    delete[] Prototype->Magnitude.Elliptical;
1620
0
    delete[] Prototype->Weight.Elliptical;
1621
0
  }
1622
0
  delete Prototype;
1623
0
} // FreePrototype
1624
1625
/**
1626
 * This routine is used to find all of the samples which
1627
 * belong to a cluster.  It starts by removing the top
1628
 * cluster on the cluster list (SearchState).  If this cluster is
1629
 * a leaf it is returned.  Otherwise, the right subcluster
1630
 * is pushed on the list and we continue the search in the
1631
 * left subcluster.  This continues until a leaf is found.
1632
 * If all samples have been found, nullptr is returned.
1633
 * InitSampleSearch() must be called
1634
 * before NextSample() to initialize the search.
1635
 * @param SearchState ptr to list containing clusters to be searched
1636
 * @return  Pointer to the next leaf cluster (sample) or nullptr.
1637
 */
1638
0
CLUSTER *NextSample(LIST *SearchState) {
1639
0
  CLUSTER *Cluster;
1640
1641
0
  if (*SearchState == NIL_LIST) {
1642
0
    return (nullptr);
1643
0
  }
1644
0
  Cluster = reinterpret_cast<CLUSTER *>((*SearchState)->first_node());
1645
0
  *SearchState = pop(*SearchState);
1646
0
  for (;;) {
1647
0
    if (Cluster->Left == nullptr) {
1648
0
      return (Cluster);
1649
0
    }
1650
0
    *SearchState = push(*SearchState, Cluster->Right);
1651
0
    Cluster = Cluster->Left;
1652
0
  }
1653
0
} // NextSample
1654
1655
/**
 * This routine returns the mean of the specified
 * prototype in the indicated dimension.
 * @param Proto prototype to return mean of
 * @param Dimension dimension whose mean is to be returned
 * @return  Mean of Prototype in Dimension
 */
float Mean(PROTOTYPE *Proto, uint16_t Dimension) {
  // Simple accessor; no bounds checking is performed on Dimension.
  return (Proto->Mean[Dimension]);
} // Mean
1665
1666
/**
1667
 * This routine returns the standard deviation of the
1668
 * prototype in the indicated dimension.
1669
 * @param Proto   prototype to return standard deviation of
1670
 * @param Dimension dimension whose stddev is to be returned
1671
 * @return  Standard deviation of Prototype in Dimension
1672
 */
1673
0
float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension) {
1674
0
  switch (Proto->Style) {
1675
0
    case spherical:
1676
0
      return std::sqrt(Proto->Variance.Spherical);
1677
0
    case elliptical:
1678
0
      return std::sqrt(Proto->Variance.Elliptical[Dimension]);
1679
0
    case mixed:
1680
0
      switch (Proto->Distrib[Dimension]) {
1681
0
        case normal:
1682
0
          return std::sqrt(Proto->Variance.Elliptical[Dimension]);
1683
0
        case uniform:
1684
0
        case D_random:
1685
0
          return Proto->Variance.Elliptical[Dimension];
1686
0
        case DISTRIBUTION_COUNT:
1687
0
          ASSERT_HOST(!"Distribution count not allowed!");
1688
0
      }
1689
0
  }
1690
0
  return 0.0f;
1691
0
} // StandardDeviation
1692
1693
/*---------------------------------------------------------------------------
1694
            Private Code
1695
----------------------------------------------------------------------------*/
1696
/**
 * This routine performs a bottoms-up clustering on the samples
 * held in the kd-tree of the Clusterer data structure.  The
 * result is a cluster tree.  Each node in the tree represents
 * a cluster which conceptually contains a subset of the samples.
 * More precisely, the cluster contains all of the samples which
 * are contained in its two sub-clusters.  The leaves of the
 * tree are the individual samples themselves; they have no
 * sub-clusters.  The root node of the tree conceptually contains
 * all of the samples.
 * The Clusterer data structure is changed: Root is set and the
 * kd-tree is consumed (freed and nulled).
 * @param Clusterer data structure holdings samples to be clustered
 */
static void CreateClusterTree(CLUSTERER *Clusterer) {
  ClusteringContext context;
  ClusterPair HeapEntry;

  // each sample and its nearest neighbor form a "potential" cluster
  // save these in a heap with the "best" (smallest distance) on top
  context.tree = Clusterer->KDTree;
  context.candidates = new TEMPCLUSTER[Clusterer->NumberOfSamples];
  context.next = 0;
  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
  KDWalk(context.tree, MakePotentialClusters, &context);

  // form potential clusters into actual clusters - always do "best" first
  while (context.heap->Pop(&HeapEntry)) {
    TEMPCLUSTER *PotentialCluster = HeapEntry.data();

    // if main cluster of potential cluster is already in another cluster
    // then we don't need to worry about it
    if (PotentialCluster->Cluster->Clustered) {
      continue;
    }

    // if main cluster is not yet clustered, but its nearest neighbor is
    // then we must find a new nearest neighbor and re-queue the candidate
    else if (PotentialCluster->Neighbor->Clustered) {
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }

    // if neither cluster is already clustered, form permanent cluster;
    // the merged cluster then becomes a candidate itself
    else {
      PotentialCluster->Cluster = MakeNewCluster(Clusterer, PotentialCluster);
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }
  }

  // the root node in the cluster tree is now the only node in the kd-tree
  Clusterer->Root = static_cast<CLUSTER *> RootOf(Clusterer->KDTree);

  // free up the memory used by the K-D tree, heap, and temp clusters
  delete context.tree;
  Clusterer->KDTree = nullptr;
  delete context.heap;
  delete[] context.candidates;
} // CreateClusterTree
1761
1762
/**
1763
 * This routine is designed to be used in concert with the
1764
 * KDWalk routine.  It will create a potential cluster for
1765
 * each sample in the kd-tree that is being walked.  This
1766
 * potential cluster will then be pushed on the heap.
1767
 * @param context  ClusteringContext (see definition above)
1768
 * @param Cluster  current cluster being visited in kd-tree walk
1769
 * @param Level  level of this cluster in the kd-tree
1770
 */
1771
0
static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t /*Level*/) {
1772
0
  ClusterPair HeapEntry;
1773
0
  int next = context->next;
1774
0
  context->candidates[next].Cluster = Cluster;
1775
0
  HeapEntry.data() = &(context->candidates[next]);
1776
0
  context->candidates[next].Neighbor =
1777
0
      FindNearestNeighbor(context->tree, context->candidates[next].Cluster, &HeapEntry.key());
1778
0
  if (context->candidates[next].Neighbor != nullptr) {
1779
0
    context->heap->Push(&HeapEntry);
1780
0
    context->next++;
1781
0
  }
1782
0
} // MakePotentialClusters
1783
1784
/**
1785
 * This routine searches the specified kd-tree for the nearest
1786
 * neighbor of the specified cluster.  It actually uses the
1787
 * kd routines to find the 2 nearest neighbors since one of them
1788
 * will be the original cluster.  A pointer to the nearest
1789
 * neighbor is returned, if it can be found, otherwise nullptr is
1790
 * returned.  The distance between the 2 nodes is placed
1791
 * in the specified variable.
1792
 * @param Tree    kd-tree to search in for nearest neighbor
1793
 * @param Cluster cluster whose nearest neighbor is to be found
1794
 * @param Distance  ptr to variable to report distance found
1795
 * @return  Pointer to the nearest neighbor of Cluster, or nullptr
1796
 */
1797
static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance)
1798
0
#define MAXNEIGHBORS 2
1799
0
#define MAXDISTANCE FLT_MAX
1800
0
{
1801
0
  CLUSTER *Neighbor[MAXNEIGHBORS];
1802
0
  float Dist[MAXNEIGHBORS];
1803
0
  int NumberOfNeighbors;
1804
0
  int32_t i;
1805
0
  CLUSTER *BestNeighbor;
1806
1807
  // find the 2 nearest neighbors of the cluster
1808
0
  KDNearestNeighborSearch(Tree, &Cluster->Mean[0], MAXNEIGHBORS, MAXDISTANCE, &NumberOfNeighbors,
1809
0
                          reinterpret_cast<void **>(Neighbor), Dist);
1810
1811
  // search for the nearest neighbor that is not the cluster itself
1812
0
  *Distance = MAXDISTANCE;
1813
0
  BestNeighbor = nullptr;
1814
0
  for (i = 0; i < NumberOfNeighbors; i++) {
1815
0
    if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
1816
0
      *Distance = Dist[i];
1817
0
      BestNeighbor = Neighbor[i];
1818
0
    }
1819
0
  }
1820
0
  return BestNeighbor;
1821
0
} // FindNearestNeighbor
1822
1823
/**
1824
 * This routine creates a new permanent cluster from the
1825
 * clusters specified in TempCluster.  The 2 clusters in
1826
 * TempCluster are marked as "clustered" and deleted from
1827
 * the kd-tree.  The new cluster is then added to the kd-tree.
1828
 * @param Clusterer current clustering environment
1829
 * @param TempCluster potential cluster to make permanent
1830
 * @return Pointer to the new permanent cluster
1831
 */
1832
0
static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
1833
  // allocate the new cluster and initialize it
1834
0
  auto Cluster = new CLUSTER(Clusterer->SampleSize);
1835
0
  Cluster->Clustered = false;
1836
0
  Cluster->Prototype = false;
1837
0
  Cluster->Left = TempCluster->Cluster;
1838
0
  Cluster->Right = TempCluster->Neighbor;
1839
0
  Cluster->CharID = -1;
1840
1841
  // mark the old clusters as "clustered" and delete them from the kd-tree
1842
0
  Cluster->Left->Clustered = true;
1843
0
  Cluster->Right->Clustered = true;
1844
0
  KDDelete(Clusterer->KDTree, &Cluster->Left->Mean[0], Cluster->Left);
1845
0
  KDDelete(Clusterer->KDTree, &Cluster->Right->Mean[0], Cluster->Right);
1846
1847
  // compute the mean and sample count for the new cluster
1848
0
  Cluster->SampleCount = MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
1849
0
                                       Cluster->Left->SampleCount, Cluster->Right->SampleCount,
1850
0
                                       &Cluster->Mean[0], &Cluster->Left->Mean[0], &Cluster->Right->Mean[0]);
1851
1852
  // add the new cluster to the KD tree
1853
0
  KDStore(Clusterer->KDTree, &Cluster->Mean[0], Cluster);
1854
0
  return Cluster;
1855
0
} // MakeNewCluster
1856
1857
/**
1858
 * This routine merges two clusters into one larger cluster.
1859
 * To do this it computes the number of samples in the new
1860
 * cluster and the mean of the new cluster.  The ParamDesc
1861
 * information is used to ensure that circular dimensions
1862
 * are handled correctly.
1863
 * @param N # of dimensions (size of arrays)
1864
 * @param ParamDesc array of dimension descriptions
1865
 * @param n1, n2  number of samples in each old cluster
1866
 * @param m array to hold mean of new cluster
1867
 * @param m1, m2  arrays containing means of old clusters
1868
 * @return  The number of samples in the new cluster.
1869
 */
1870
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[],
1871
0
                      float m1[], float m2[]) {
1872
0
  int32_t i, n;
1873
1874
0
  n = n1 + n2;
1875
0
  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
1876
0
    if (ParamDesc->Circular) {
1877
      // if distance between means is greater than allowed
1878
      // reduce upper point by one "rotation" to compute mean
1879
      // then normalize the mean back into the accepted range
1880
0
      if ((*m2 - *m1) > ParamDesc->HalfRange) {
1881
0
        *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
1882
0
        if (*m < ParamDesc->Min) {
1883
0
          *m += ParamDesc->Range;
1884
0
        }
1885
0
      } else if ((*m1 - *m2) > ParamDesc->HalfRange) {
1886
0
        *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
1887
0
        if (*m < ParamDesc->Min) {
1888
0
          *m += ParamDesc->Range;
1889
0
        }
1890
0
      } else {
1891
0
        *m = (n1 * *m1 + n2 * *m2) / n;
1892
0
      }
1893
0
    } else {
1894
0
      *m = (n1 * *m1 + n2 * *m2) / n;
1895
0
    }
1896
0
  }
1897
0
  return n;
1898
0
} // MergeClusters
1899
1900
/**
1901
 * This routine decides which clusters in the cluster tree
1902
 * should be represented by prototypes, forms a list of these
1903
 * prototypes, and places the list in the Clusterer data
1904
 * structure.
1905
 * @param Clusterer data structure holding cluster tree
1906
 * @param Config    parameters used to control prototype generation
1907
 */
1908
0
static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
1909
0
  LIST ClusterStack = NIL_LIST;
1910
0
  CLUSTER *Cluster;
1911
0
  PROTOTYPE *Prototype;
1912
1913
  // use a stack to keep track of clusters waiting to be processed
1914
  // initially the only cluster on the stack is the root cluster
1915
0
  if (Clusterer->Root != nullptr) {
1916
0
    ClusterStack = push(NIL_LIST, Clusterer->Root);
1917
0
  }
1918
1919
  // loop until we have analyzed all clusters which are potential prototypes
1920
0
  while (ClusterStack != NIL_LIST) {
1921
    // remove the next cluster to be analyzed from the stack
1922
    // try to make a prototype from the cluster
1923
    // if successful, put it on the proto list, else split the cluster
1924
0
    Cluster = reinterpret_cast<CLUSTER *>(ClusterStack->first_node());
1925
0
    ClusterStack = pop(ClusterStack);
1926
0
    Prototype = MakePrototype(Clusterer, Config, Cluster);
1927
0
    if (Prototype != nullptr) {
1928
0
      Clusterer->ProtoList = push(Clusterer->ProtoList, Prototype);
1929
0
    } else {
1930
0
      ClusterStack = push(ClusterStack, Cluster->Right);
1931
0
      ClusterStack = push(ClusterStack, Cluster->Left);
1932
0
    }
1933
0
  }
1934
0
} // ComputePrototypes
1935
1936
/**
 * This routine attempts to create a prototype from the
 * specified cluster that conforms to the distribution
 * specified in Config.  If there are too few samples in the
 * cluster to perform a statistical analysis, then a prototype
 * is generated but labelled as insignificant.  If the
 * dimensions of the cluster are not independent, no prototype
 * is generated and nullptr is returned.  If a prototype can be
 * found that matches the desired distribution then a pointer
 * to it is returned, otherwise nullptr is returned.
 * @param Clusterer data structure holding cluster tree
 * @param Config  parameters used to control prototype generation
 * @param Cluster cluster to be made into a prototype
 * @return  Pointer to new prototype or nullptr
 */
static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster) {
  PROTOTYPE *Proto;
  BUCKETS *Buckets;

  // filter out clusters which contain samples from the same character
  if (MultipleCharSamples(Clusterer, Cluster, Config->MaxIllegal)) {
    return nullptr;
  }

  // compute the covariance matrix and ranges for the cluster
  auto Statistics = ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);

  // check for degenerate clusters which need not be analyzed further
  // note that the MinSamples test assumes that all clusters with multiple
  // character samples have been removed (as above)
  Proto = MakeDegenerateProto(Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
                              static_cast<int32_t>(Config->MinSamples * Clusterer->NumChar));
  if (Proto != nullptr) {
    delete Statistics;
    return Proto;
  }
  // check to ensure that all dimensions are independent
  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize, &Statistics->CoVariance[0],
                   Config->Independence)) {
    delete Statistics;
    return nullptr;
  }

  // For elliptical protos, try Hotelling's T-squared test first to decide
  // whether the cluster should be split instead of modeled as one proto.
  if (HOTELLING && Config->ProtoStyle == elliptical) {
    Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
    if (Proto != nullptr) {
      delete Statistics;
      return Proto;
    }
  }

  // create a histogram data structure used to evaluate distributions
  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount, Config->Confidence);

  // create a prototype based on the statistics and test it;
  // "automatic" tries progressively more general styles until one fits
  switch (Config->ProtoStyle) {
    case spherical:
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case elliptical:
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case mixed:
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
      break;
    case automatic:
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr) {
        break;
      }
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr) {
        break;
      }
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
      break;
  }
  delete Statistics;
  return Proto;
} // MakePrototype
2016
2017
/**
2018
 * This routine checks for clusters which are degenerate and
2019
 * therefore cannot be analyzed in a statistically valid way.
2020
 * A cluster is defined as degenerate if it does not have at
2021
 * least MINSAMPLESNEEDED samples in it.  If the cluster is
2022
 * found to be degenerate, a prototype of the specified style
2023
 * is generated and marked as insignificant.  A cluster is
2024
 * also degenerate if it does not have at least MinSamples
2025
 * samples in it.
2026
 *
2027
 * If the cluster is not degenerate, nullptr is returned.
2028
 *
2029
 * @param N   number of dimensions
2030
 * @param Cluster   cluster being analyzed
2031
 * @param Statistics  statistical info about cluster
2032
 * @param Style   type of prototype to be generated
2033
 * @param MinSamples  minimum number of samples in a cluster
2034
 * @return  Pointer to degenerate prototype or nullptr.
2035
 */
2036
static PROTOTYPE *MakeDegenerateProto( // this was MinSample
2037
0
    uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, int32_t MinSamples) {
2038
0
  PROTOTYPE *Proto = nullptr;
2039
2040
0
  if (MinSamples < MINSAMPLESNEEDED) {
2041
0
    MinSamples = MINSAMPLESNEEDED;
2042
0
  }
2043
2044
0
  if (Cluster->SampleCount < MinSamples) {
2045
0
    switch (Style) {
2046
0
      case spherical:
2047
0
        Proto = NewSphericalProto(N, Cluster, Statistics);
2048
0
        break;
2049
0
      case elliptical:
2050
0
      case automatic:
2051
0
        Proto = NewEllipticalProto(N, Cluster, Statistics);
2052
0
        break;
2053
0
      case mixed:
2054
0
        Proto = NewMixedProto(N, Cluster, Statistics);
2055
0
        break;
2056
0
    }
2057
0
    Proto->Significant = false;
2058
0
  }
2059
0
  return (Proto);
2060
0
} // MakeDegenerateProto
2061
2062
/**
 * This routine tests the specified cluster to see if
 * there is a statistically significant difference between
 * the sub-clusters that would be made if the cluster were to
 * be split (Hotelling's T-squared test, converted to an
 * F-statistic and compared against FTable at 1% confidence).
 * If not, then a new prototype is formed and
 * returned to the caller. If there is, then nullptr is returned
 * to the caller so the cluster will be split.
 * @param Clusterer data struct containing samples being clustered
 * @param Config provides the magic number of samples that make a good cluster
 * @param Cluster   cluster to be made into an elliptical prototype
 * @param Statistics  statistical info about cluster
 * @return Pointer to new elliptical prototype or nullptr.
 */
static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,
                                      STATISTICS *Statistics) {
  // Fraction of the number of samples used as a range around 1 within
  // which a cluster has the magic size that allows a boost to the
  // FTable by kFTableBoostMargin, thus allowing clusters near the
  // magic size (equal to the number of sample characters) to be more
  // likely to stay together.
  const double kMagicSampleMargin = 0.0625;
  const double kFTableBoostMargin = 2.0;

  int N = Clusterer->SampleSize;
  CLUSTER *Left = Cluster->Left;
  CLUSTER *Right = Cluster->Right;
  // Leaf clusters have nothing to split.
  if (Left == nullptr || Right == nullptr) {
    return nullptr;
  }
  int TotalDims = Left->SampleCount + Right->SampleCount;
  // Too few samples for the test to be meaningful.
  if (TotalDims < N + 1 || TotalDims < 2) {
    return nullptr;
  }
  std::vector<float> Covariance(static_cast<size_t>(N) * N);
  std::vector<float> Inverse(static_cast<size_t>(N) * N);
  std::vector<float> Delta(N);
  // Compute a new covariance matrix that only uses essential features;
  // non-essential rows/columns are replaced by identity so the matrix
  // stays invertible.
  for (int i = 0; i < N; ++i) {
    int row_offset = i * N;
    if (!Clusterer->ParamDesc[i].NonEssential) {
      for (int j = 0; j < N; ++j) {
        if (!Clusterer->ParamDesc[j].NonEssential) {
          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    } else {
      for (int j = 0; j < N; ++j) {
        if (i == j) {
          Covariance[j + row_offset] = 1.0f;
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    }
  }
  double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);
  if (err > 1) {
    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
  }
  // Delta = difference of sub-cluster means over essential dims only;
  // EssentialN counts the dimensions actually participating in the test.
  int EssentialN = 0;
  for (int dim = 0; dim < N; ++dim) {
    if (!Clusterer->ParamDesc[dim].NonEssential) {
      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
      ++EssentialN;
    } else {
      Delta[dim] = 0.0f;
    }
  }
  // Compute Hotelling's T-squared: Delta' * Inverse * Delta.
  double Tsq = 0.0;
  for (int x = 0; x < N; ++x) {
    double temp = 0.0;
    for (int y = 0; y < N; ++y) {
      temp += static_cast<double>(Inverse[y + N * x]) * Delta[y];
    }
    Tsq += Delta[x] * temp;
  }
  // Changed this function to match the formula in
  // Statistical Methods in Medical Research p 473
  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2) * EssentialN);
  // Clamp the degrees-of-freedom indices into the bounds of FTable.
  int Fx = EssentialN;
  if (Fx > FTABLE_X) {
    Fx = FTABLE_X;
  }
  --Fx;
  int Fy = TotalDims - EssentialN - 1;
  if (Fy > FTABLE_Y) {
    Fy = FTABLE_Y;
  }
  --Fy;
  double FTarget = FTable[Fy][Fx];
  if (Config->MagicSamples > 0 && TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
    // Give magic-sized clusters a magic FTable boost.
    FTarget += kFTableBoostMargin;
  }
  // F below the table value => no significant difference => keep as one proto.
  if (F < FTarget) {
    return NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
  }
  return nullptr;
}
2167
2168
/**
2169
 * This routine tests the specified cluster to see if it can
2170
 * be approximated by a spherical normal distribution.  If it
2171
 * can be, then a new prototype is formed and returned to the
2172
 * caller.  If it can't be, then nullptr is returned to the caller.
2173
 * @param Clusterer data struct containing samples being clustered
2174
 * @param Cluster   cluster to be made into a spherical prototype
2175
 * @param Statistics  statistical info about cluster
2176
 * @param Buckets   histogram struct used to analyze distribution
2177
 * @return  Pointer to new spherical prototype or nullptr.
2178
 */
2179
static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                     BUCKETS *Buckets) {
  // A spherical proto is acceptable only if every essential dimension looks
  // normally distributed when measured against the single average variance
  // shared by all dimensions.
  const double AvgStdDev = sqrt(static_cast<double>(Statistics->AvgVariance));
  bool AllDimsNormal = true;
  for (int Dim = 0; Dim < Clusterer->SampleSize; Dim++) {
    if (Clusterer->ParamDesc[Dim].NonEssential) {
      continue; // dimensions that carry no information are not tested
    }
    FillBuckets(Buckets, Cluster, Dim, &(Clusterer->ParamDesc[Dim]), Cluster->Mean[Dim],
                AvgStdDev);
    if (!DistributionOK(Buckets)) {
      AllDimsNormal = false; // one failing dimension rejects the whole proto
      break;
    }
  }
  // Build the prototype only when no dimension rejected the hypothesis.
  if (AllDimsNormal) {
    return NewSphericalProto(Clusterer->SampleSize, Cluster, Statistics);
  }
  return nullptr;
} // MakeSphericalProto
2202
2203
/**
2204
 * This routine tests the specified cluster to see if it can
2205
 * be approximated by an elliptical normal distribution.  If it
2206
 * can be, then a new prototype is formed and returned to the
2207
 * caller.  If it can't be, then nullptr is returned to the caller.
2208
 * @param Clusterer data struct containing samples being clustered
2209
 * @param Cluster   cluster to be made into an elliptical prototype
2210
 * @param Statistics  statistical info about cluster
2211
 * @param Buckets   histogram struct used to analyze distribution
2212
 * @return  Pointer to new elliptical prototype or nullptr.
2213
 */
2214
static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
                                      STATISTICS *Statistics, BUCKETS *Buckets) {
  // An elliptical proto is acceptable only if every essential dimension
  // individually passes a normality test against its own variance.
  const int N = Clusterer->SampleSize;
  int Dim = 0;
  for (; Dim < N; Dim++) {
    if (Clusterer->ParamDesc[Dim].NonEssential) {
      continue; // uninformative dimensions are not tested
    }
    // The diagonal element of the covariance matrix is this dimension's
    // variance; its square root is the standard deviation used for binning.
    FillBuckets(Buckets, Cluster, Dim, &(Clusterer->ParamDesc[Dim]), Cluster->Mean[Dim],
                sqrt(static_cast<double>(Statistics->CoVariance[Dim * (N + 1)])));
    if (!DistributionOK(Buckets)) {
      break; // one bad dimension is enough to reject
    }
  }
  // Dim only reaches N when no dimension failed the test.
  if (Dim >= N) {
    return NewEllipticalProto(N, Cluster, Statistics);
  }
  return nullptr;
} // MakeEllipticalProto
2237
2238
/**
2239
 * This routine tests each dimension of the specified cluster to
2240
 * see what distribution would best approximate that dimension.
2241
 * Each dimension is compared to the following distributions
2242
 * in order: normal, random, uniform.  If each dimension can
2243
 * be represented by one of these distributions,
2244
 * then a new prototype is formed and returned to the
2245
 * caller.  If it can't be, then nullptr is returned to the caller.
2246
 * @param Clusterer data struct containing samples being clustered
2247
 * @param Cluster   cluster to be made into a prototype
2248
 * @param Statistics  statistical info about cluster
2249
 * @param NormalBuckets histogram struct used to analyze distribution
2250
 * @param Confidence  confidence level for alternate distributions
2251
 * @return  Pointer to new mixed prototype or nullptr.
2252
 */
2253
static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                 BUCKETS *NormalBuckets, double Confidence) {
  PROTOTYPE *Proto;
  int i;
  // Bucket structures for the fallback distributions are created lazily,
  // only if some dimension actually fails the normal test.
  BUCKETS *UniformBuckets = nullptr;
  BUCKETS *RandomBuckets = nullptr;

  // create a mixed proto to work on - initially assume all dimensions normal
  Proto = NewMixedProto(Clusterer->SampleSize, Cluster, Statistics);

  // find the proper distribution for each dimension
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential) {
      continue; // non-essential dimensions keep the default (normal) label
    }

    // 1st try: normal, using this dimension's own mean and std deviation.
    FillBuckets(NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                std::sqrt(Proto->Variance.Elliptical[i]));
    if (DistributionOK(NormalBuckets)) {
      continue;
    }

    // 2nd try: D_random.  MakeDimRandom re-labels the dimension first, so
    // the Mean/Variance passed below are the mid-range/half-range values it
    // installed (for non-normal dims "StdDev" means half the range).
    if (RandomBuckets == nullptr) {
      RandomBuckets = GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
    }
    MakeDimRandom(i, Proto, &(Clusterer->ParamDesc[i]));
    FillBuckets(RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                Proto->Variance.Elliptical[i]);
    if (DistributionOK(RandomBuckets)) {
      continue;
    }

    // 3rd try: uniform over the observed range of the dimension.
    if (UniformBuckets == nullptr) {
      UniformBuckets = GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
    }
    MakeDimUniform(i, Proto, Statistics);
    FillBuckets(UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                Proto->Variance.Elliptical[i]);
    if (DistributionOK(UniformBuckets)) {
      continue;
    }
    // no candidate distribution fits this dimension: give up entirely
    break;
  }
  // if any dimension failed to match a distribution, discard the proto
  if (i < Clusterer->SampleSize) {
    FreePrototype(Proto);
    Proto = nullptr;
  }
  return (Proto);
} // MakeMixedProto
2303
2304
/**
2305
 * This routine alters the ith dimension of the specified
2306
 * mixed prototype to be D_random.
2307
 * @param i index of dimension to be changed
2308
 * @param Proto prototype whose dimension is to be altered
2309
 * @param ParamDesc description of specified dimension
2310
 */
2311
0
static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
2312
0
  Proto->Distrib[i] = D_random;
2313
0
  Proto->Mean[i] = ParamDesc->MidRange;
2314
0
  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
2315
2316
  // subtract out the previous magnitude of this dimension from the total
2317
0
  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
2318
0
  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
2319
0
  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2320
0
  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
2321
2322
  // note that the proto Weight is irrelevant for D_random protos
2323
0
} // MakeDimRandom
2324
2325
/**
2326
 * This routine alters the ith dimension of the specified
2327
 * mixed prototype to be uniform.
2328
 * @param i index of dimension to be changed
2329
 * @param Proto   prototype whose dimension is to be altered
2330
 * @param Statistics  statistical info about prototype
2331
 */
2332
0
static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics) {
2333
0
  Proto->Distrib[i] = uniform;
2334
0
  Proto->Mean[i] = Proto->Cluster->Mean[i] + (Statistics->Min[i] + Statistics->Max[i]) / 2;
2335
0
  Proto->Variance.Elliptical[i] = (Statistics->Max[i] - Statistics->Min[i]) / 2;
2336
0
  if (Proto->Variance.Elliptical[i] < MINVARIANCE) {
2337
0
    Proto->Variance.Elliptical[i] = MINVARIANCE;
2338
0
  }
2339
2340
  // subtract out the previous magnitude of this dimension from the total
2341
0
  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
2342
0
  Proto->Magnitude.Elliptical[i] = 1.0 / (2.0 * Proto->Variance.Elliptical[i]);
2343
0
  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2344
0
  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
2345
2346
  // note that the proto Weight is irrelevant for uniform protos
2347
0
} // MakeDimUniform
2348
2349
/**
2350
 * This routine searches the cluster tree for all leaf nodes
2351
 * which are samples in the specified cluster.  It computes
2352
 * a full covariance matrix for these samples as well as
2353
 * keeping track of the ranges (min and max) for each
2354
 * dimension.  A special data structure is allocated to
2355
 * return this information to the caller.  An incremental
2356
 * algorithm for computing statistics is not used because
2357
 * it will not work with circular dimensions.
2358
 * @param N number of dimensions
2359
 * @param ParamDesc array of dimension descriptions
2360
 * @param Cluster cluster whose stats are to be computed
2361
 * @return  Pointer to new data structure containing statistics
2362
 */
2363
0
static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster) {
  int i, j;
  LIST SearchState;
  SAMPLE *Sample;
  uint32_t SampleCountAdjustedForBias;

  // allocate memory to hold the statistics results
  auto Statistics = new STATISTICS(N);

  // allocate temporary memory to hold the sample to mean distances
  std::vector<float> Distance(N);

  // find each sample in the cluster and merge it into the statistics
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample(&SearchState)) != nullptr) {
    for (i = 0; i < N; i++) {
      Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
      if (ParamDesc[i].Circular) {
        // wrap circular dimensions so the offset lies within
        // [-HalfRange, HalfRange] of the cluster mean
        if (Distance[i] > ParamDesc[i].HalfRange) {
          Distance[i] -= ParamDesc[i].Range;
        }
        if (Distance[i] < -ParamDesc[i].HalfRange) {
          Distance[i] += ParamDesc[i].Range;
        }
      }
      // track the extreme offsets seen in each dimension
      if (Distance[i] < Statistics->Min[i]) {
        Statistics->Min[i] = Distance[i];
      }
      if (Distance[i] > Statistics->Max[i]) {
        Statistics->Max[i] = Distance[i];
      }
    }
    // accumulate the outer product of the offset vector into the
    // row-major N x N covariance matrix
    auto CoVariance = &Statistics->CoVariance[0];
    for (i = 0; i < N; i++) {
      for (j = 0; j < N; j++, CoVariance++) {
        *CoVariance += Distance[i] * Distance[j];
      }
    }
  }
  // normalize the variances by the total number of samples
  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
  // also compute the geometic mean of the diagonal variances
  // ensure that clusters with only 1 sample are handled correctly
  if (Cluster->SampleCount > 1) {
    SampleCountAdjustedForBias = Cluster->SampleCount - 1;
  } else {
    SampleCountAdjustedForBias = 1;
  }
  auto CoVariance = &Statistics->CoVariance[0];
  for (i = 0; i < N; i++) {
    for (j = 0; j < N; j++, CoVariance++) {
      *CoVariance /= SampleCountAdjustedForBias;
      if (j == i) {
        // clip tiny diagonal variances and accumulate their product for
        // the geometric mean computed below
        if (*CoVariance < MINVARIANCE) {
          *CoVariance = MINVARIANCE;
        }
        Statistics->AvgVariance *= *CoVariance;
      }
    }
  }
  // convert the accumulated product into the geometric mean of the N
  // diagonal variances
  Statistics->AvgVariance =
      static_cast<float>(pow(static_cast<double>(Statistics->AvgVariance), 1.0 / N));

  return Statistics;
} // ComputeStatistics
2428
2429
/**
2430
 * This routine creates a spherical prototype data structure to
2431
 * approximate the samples in the specified cluster.
2432
 * Spherical prototypes have a single variance which is
2433
 * common across all dimensions.  All dimensions are normally
2434
 * distributed and independent.
2435
 * @param N number of dimensions
2436
 * @param Cluster cluster to be made into a spherical prototype
2437
 * @param Statistics  statistical info about samples in cluster
2438
 * @return  Pointer to a new spherical prototype data structure
2439
 */
2440
0
static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
  auto Proto = NewSimpleProto(N, Cluster);

  // One variance is shared by all N dimensions; clamp it away from zero so
  // the density computations below stay finite.
  float Variance = Statistics->AvgVariance;
  if (Variance < MINVARIANCE) {
    Variance = MINVARIANCE;
  }
  Proto->Variance.Spherical = Variance;

  // Peak density of a 1-D normal with this variance...
  Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Variance);
  // ...raised to the Nth power for the joint density of N independent dims.
  Proto->TotalMagnitude = static_cast<float>(
      pow(static_cast<double>(Proto->Magnitude.Spherical), static_cast<double>(N)));
  Proto->Weight.Spherical = 1.0 / Variance;
  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));

  return Proto;
} // NewSphericalProto
2458
2459
/**
2460
 * This routine creates an elliptical prototype data structure to
2461
 * approximate the samples in the specified cluster.
2462
 * Elliptical prototypes have a variance for each dimension.
2463
 * All dimensions are normally distributed and independent.
2464
 * @param N number of dimensions
2465
 * @param Cluster cluster to be made into an elliptical prototype
2466
 * @param Statistics  statistical info about samples in cluster
2467
 * @return  Pointer to a new elliptical prototype data structure
2468
 */
2469
0
static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
  auto Proto = NewSimpleProto(N, Cluster);
  // Elliptical protos carry per-dimension variance, magnitude and weight.
  Proto->Variance.Elliptical = new float[N];
  Proto->Magnitude.Elliptical = new float[N];
  Proto->Weight.Elliptical = new float[N];

  Proto->TotalMagnitude = 1.0;
  for (int i = 0; i < N; i++) {
    // Element [i][i] of the row-major covariance matrix is dimension i's
    // variance; clamp it so the densities below stay finite.
    float Variance = Statistics->CoVariance[i * (N + 1)];
    if (Variance < MINVARIANCE) {
      Variance = MINVARIANCE;
    }
    Proto->Variance.Elliptical[i] = Variance;

    Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Variance);
    Proto->Weight.Elliptical[i] = 1.0f / Variance;
    Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
  }
  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
  Proto->Style = elliptical;
  return Proto;
} // NewEllipticalProto
2494
2495
/**
2496
 * This routine creates a mixed prototype data structure to
2497
 * approximate the samples in the specified cluster.
2498
 * Mixed prototypes can have different distributions for
2499
 * each dimension.  All dimensions are independent.  The
2500
 * structure is initially filled in as though it were an
2501
 * elliptical prototype.  The actual distributions of the
2502
 * dimensions can be altered by other routines.
2503
 * @param N number of dimensions
2504
 * @param Cluster cluster to be made into a mixed prototype
2505
 * @param Statistics  statistical info about samples in cluster
2506
 * @return  Pointer to a new mixed prototype data structure
2507
 */
2508
0
static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
  // Start from a full elliptical proto; the per-dimension distributions all
  // begin as normal and may be re-labeled later by other routines.
  auto Proto = NewEllipticalProto(N, Cluster, Statistics);
  Proto->Distrib.assign(N, normal);
  Proto->Style = mixed;
  return Proto;
} // NewMixedProto
2515
2516
/**
2517
 * This routine allocates memory to hold a simple prototype
2518
 * data structure, i.e. one without independent distributions
2519
 * and variances for each dimension.
2520
 * @param N number of dimensions
2521
 * @param Cluster cluster to be made into a prototype
2522
 * @return  Pointer to new simple prototype
2523
 */
2524
0
static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster) {
  // Allocate a bare prototype attached to its cluster.  Style defaults to
  // spherical; callers fill in variance/magnitude/weight afterwards.
  // (N is accepted for interface symmetry but not used here.)
  auto Proto = new PROTOTYPE;
  Proto->Significant = true;
  Proto->Merged = false;
  Proto->Style = spherical;
  Proto->Distrib.clear();
  Proto->Mean = Cluster->Mean;
  Proto->NumSamples = Cluster->SampleCount;
  Proto->Cluster = Cluster;
  Proto->Cluster->Prototype = true;
  return Proto;
} // NewSimpleProto
2536
2537
/**
2538
 * This routine returns true if the specified covariance
2539
 * matrix indicates that all N dimensions are independent of
2540
 * one another.  One dimension is judged to be independent of
2541
 * another when the magnitude of the corresponding correlation
2542
 * coefficient is
2543
 * less than the specified Independence factor.  The
2544
 * correlation coefficient is calculated as: (see Duda and
2545
 * Hart, pg. 247)
2546
 * coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj])
2547
 * The covariance matrix is assumed to be symmetric (which
2548
 * should always be true).
2549
 * @param ParamDesc descriptions of each feature space dimension
2550
 * @param N number of dimensions
2551
 * @param CoVariance  ptr to a covariance matrix
2552
 * @param Independence  max off-diagonal correlation coefficient
2553
 * @return true if dimensions are independent, false otherwise
2554
 */
2555
0
static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence) {
  int i, j;
  float *VARii; // points to ith on-diagonal element
  float *VARjj; // points to jth on-diagonal element
  float CorrelationCoeff;

  // walk the upper triangle of the (row-major, N x N) covariance matrix
  VARii = CoVariance;
  for (i = 0; i < N; i++, VARii += N + 1) {
    if (ParamDesc[i].NonEssential) {
      continue; // uninformative dimensions are exempt from the test
    }

    // CoVariance tracks element [i][j]; VARjj tracks diagonal [j][j]
    VARjj = VARii + N + 1;
    CoVariance = VARii + 1;
    for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
      if (ParamDesc[j].NonEssential) {
        continue;
      }

      if ((*VARii == 0.0) || (*VARjj == 0.0)) {
        // a zero-variance dimension is treated as correlated with nothing
        CorrelationCoeff = 0.0;
      } else {
        // NOTE(review): the nested sqrt(sqrt(...)) yields the square root of
        // |corr coeff|, not the coefficient itself as the header comment's
        // formula suggests -- presumably intentional (it only softens the
        // threshold), but worth confirming before relying on the formula.
        CorrelationCoeff = sqrt(std::sqrt(*CoVariance * *CoVariance / (*VARii * *VARjj)));
      }
      // any pair exceeding the threshold means the dims are not independent
      if (CorrelationCoeff > Independence) {
        return false;
      }
    }
  }
  return true;
} // Independent
2586
2587
/**
2588
 * This routine returns a histogram data structure which can
2589
 * be used by other routines to place samples into histogram
2590
 * buckets, and then apply a goodness of fit test to the
2591
 * histogram data to determine if the samples belong to the
2592
 * specified probability distribution.  The routine keeps
2593
 * a list of bucket data structures which have already been
2594
 * created so that it minimizes the computation time needed
2595
 * to create a new bucket.
2596
 * @param clusterer  which keeps a bucket_cache for us.
2597
 * @param Distribution  type of probability distribution to test for
2598
 * @param SampleCount number of samples that are available
2599
 * @param Confidence  probability of a Type I error
2600
 * @return  Bucket data structure
2601
 */
2602
static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount,
                           double Confidence) {
  // The cache is keyed by distribution type and bucket count.
  const uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
  BUCKETS *&CacheSlot = clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];

  if (CacheSlot == nullptr) {
    // Nothing cached for this (distribution, bucket count): build and keep it.
    CacheSlot = MakeBuckets(Distribution, SampleCount, Confidence);
  } else {
    // Reuse the cached structure, rescaling to the current sample count and
    // confidence before clearing the observed counts.
    if (SampleCount != CacheSlot->SampleCount) {
      AdjustBuckets(CacheSlot, SampleCount);
    }
    if (Confidence != CacheSlot->Confidence) {
      CacheSlot->Confidence = Confidence;
      CacheSlot->ChiSquared =
          ComputeChiSquared(DegreesOfFreedom(Distribution, CacheSlot->NumberOfBuckets), Confidence);
    }
    InitBuckets(CacheSlot);
  }
  return CacheSlot;
} // GetBuckets
2626
2627
/**
2628
 * This routine creates a histogram data structure which can
2629
 * be used by other routines to place samples into histogram
2630
 * buckets, and then apply a goodness of fit test to the
2631
 * histogram data to determine if the samples belong to the
2632
 * specified probability distribution.  The buckets are
2633
 * allocated in such a way that the expected frequency of
2634
 * samples in each bucket is approximately the same.  In
2635
 * order to make this possible, a mapping table is
2636
 * computed which maps "normalized" samples into the
2637
 * appropriate bucket.
2638
 * @param Distribution  type of probability distribution to test for
2639
 * @param SampleCount number of samples that are available
2640
 * @param Confidence  probability of a Type I error
2641
 * @return Pointer to new histogram data structure
2642
 */
2643
0
static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence) {
  // density functions indexed by DISTRIBUTION (normal, uniform, D_random)
  const DENSITYFUNC DensityFunction[] = {NormalDensity, UniformDensity, UniformDensity};
  int i, j;
  double BucketProbability;
  double NextBucketBoundary;
  double Probability;
  double ProbabilityDelta;
  double LastProbDensity;
  double ProbDensity;
  uint16_t CurrentBucket;
  bool Symmetrical;

  // allocate memory needed for data structure
  auto Buckets = new BUCKETS(OptimumNumberOfBuckets(SampleCount));
  Buckets->SampleCount = SampleCount;
  Buckets->Confidence = Confidence;

  // initialize simple fields
  Buckets->Distribution = Distribution;

  // all currently defined distributions are symmetrical
  Symmetrical = true;
  Buckets->ChiSquared =
      ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);

  if (Symmetrical) {
    // allocate buckets so that all have approx. equal probability
    BucketProbability = 1.0 / static_cast<double>(Buckets->NumberOfBuckets);

    // distribution is symmetric so fill in upper half then copy
    CurrentBucket = Buckets->NumberOfBuckets / 2;
    if (Odd(Buckets->NumberOfBuckets)) {
      // the middle bucket straddles the mean, so only half of its
      // probability lies in the upper half of the distribution
      NextBucketBoundary = BucketProbability / 2;
    } else {
      NextBucketBoundary = BucketProbability;
    }

    // integrate the density across the upper half of the table, advancing
    // to the next bucket whenever the accumulated probability crosses a
    // bucket boundary
    Probability = 0.0;
    LastProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(BUCKETTABLESIZE / 2);
    for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
      ProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(i + 1);
      // trapezoidal integration over one table step of width 1
      ProbabilityDelta = Integral(LastProbDensity, ProbDensity, 1.0);
      Probability += ProbabilityDelta;
      if (Probability > NextBucketBoundary) {
        if (CurrentBucket < Buckets->NumberOfBuckets - 1) {
          CurrentBucket++;
        }
        NextBucketBoundary += BucketProbability;
      }
      Buckets->Bucket[i] = CurrentBucket;
      Buckets->ExpectedCount[CurrentBucket] += static_cast<float>(ProbabilityDelta * SampleCount);
      LastProbDensity = ProbDensity;
    }
    // place any leftover probability into the last bucket
    Buckets->ExpectedCount[CurrentBucket] += static_cast<float>((0.5 - Probability) * SampleCount);

    // copy upper half of distribution to lower half
    for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--) {
      Buckets->Bucket[i] = Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
    }

    // copy upper half of expected counts to lower half
    for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) {
      Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
    }
  }
  return Buckets;
} // MakeBuckets
2711
2712
/**
2713
 * This routine computes the optimum number of histogram
2714
 * buckets that should be used in a chi-squared goodness of
2715
 * fit test for the specified number of samples.  The optimum
2716
 * number is computed based on Table 4.1 on pg. 147 of
2717
 * "Measurement and Analysis of Random Data" by Bendat & Piersol.
2718
 * Linear interpolation is used to interpolate between table
2719
 * values.  The table is intended for a 0.05 level of
2720
 * significance (alpha).  This routine assumes that it is
2721
 * equally valid for other alpha's, which may not be true.
2722
 * @param SampleCount number of samples to be tested
2723
 * @return Optimum number of histogram buckets
2724
 */
2725
0
static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) {
2726
0
  uint8_t Last, Next;
2727
0
  float Slope;
2728
2729
0
  if (SampleCount < kCountTable[0]) {
2730
0
    return kBucketsTable[0];
2731
0
  }
2732
2733
0
  for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
2734
0
    if (SampleCount <= kCountTable[Next]) {
2735
0
      Slope = static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) /
2736
0
              static_cast<float>(kCountTable[Next] - kCountTable[Last]);
2737
0
      return (
2738
0
          static_cast<uint16_t>(kBucketsTable[Last] + Slope * (SampleCount - kCountTable[Last])));
2739
0
    }
2740
0
  }
2741
0
  return kBucketsTable[Last];
2742
0
} // OptimumNumberOfBuckets
2743
2744
/**
2745
 * This routine computes the chi-squared value which will
2746
 * leave a cumulative probability of Alpha in the right tail
2747
 * of a chi-squared distribution with the specified number of
2748
 * degrees of freedom.  Alpha must be between 0 and 1.
2749
 * DegreesOfFreedom must be even.  The routine maintains an
2750
 * array of lists.  Each list corresponds to a different
2751
 * number of degrees of freedom.  Each entry in the list
2752
 * corresponds to a different alpha value and its corresponding
2753
 * chi-squared value.  Therefore, once a particular chi-squared
2754
 * value is computed, it is stored in the list and never
2755
 * needs to be computed again.
2756
 * @param DegreesOfFreedom  determines shape of distribution
2757
 * @param Alpha probability of right tail
2758
 * @return Desired chi-squared value
2759
 */
2760
static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha)
#define CHIACCURACY 0.01
#define MINALPHA (1e-200)
{
  // cache of previously solved chi-squared values, one list per DOF
  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];

  // limit the minimum alpha that can be used - if alpha is too small
  //      it may not be possible to compute chi-squared.
  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
  if (Odd(DegreesOfFreedom)) {
    DegreesOfFreedom++; // the solver requires an even number of DOF
  }

  /* find the list of chi-squared values which have already been computed
   for the specified number of degrees of freedom.  Search the list for
   the desired chi-squared. */
  CHISTRUCT SearchKey(0.0, Alpha);
  auto *found = search(ChiWith[DegreesOfFreedom], &SearchKey, AlphaMatch);
  auto OldChiSquared = reinterpret_cast<CHISTRUCT *>(found ? found->first_node() : nullptr);

  if (OldChiSquared == nullptr) {
    // not cached yet: solve for the chi-squared value numerically and
    // remember it for subsequent calls
    OldChiSquared = new CHISTRUCT(DegreesOfFreedom, Alpha);
    OldChiSquared->ChiSquared =
        Solve(ChiArea, OldChiSquared, static_cast<double>(DegreesOfFreedom), CHIACCURACY);
    ChiWith[DegreesOfFreedom] = push(ChiWith[DegreesOfFreedom], OldChiSquared);
  } else {
    // further optimization might move OldChiSquared to front of list
  }

  return (OldChiSquared->ChiSquared);

} // ComputeChiSquared
2792
2793
/**
2794
 * This routine computes the probability density function
2795
 * of a discrete normal distribution defined by the global
2796
 * variables kNormalMean, kNormalVariance, and kNormalMagnitude.
2797
 * Normal magnitude could, of course, be computed in terms of
2798
 * the normal variance but it is precomputed for efficiency.
2799
 * @param x number to compute the normal probability density for
2800
 * @note Globals:
2801
 *    kNormalMean mean of a discrete normal distribution
2802
 *    kNormalVariance variance of a discrete normal distribution
2803
 *    kNormalMagnitude  magnitude of a discrete normal distribution
2804
 * @return  The value of the normal distribution at x.
2805
 */
2806
0
static double NormalDensity(int32_t x) {
2807
0
  double Distance;
2808
2809
0
  Distance = x - kNormalMean;
2810
0
  return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
2811
0
} // NormalDensity
2812
2813
/**
2814
 * This routine computes the probability density function
2815
 * of a uniform distribution at the specified point.  The
2816
 * range of the distribution is from 0 to BUCKETTABLESIZE.
2817
 * @param x number to compute the uniform probability density for
2818
 * @return The value of the uniform distribution at x.
2819
 */
2820
0
static double UniformDensity(int32_t x) {
2821
0
  constexpr auto UniformDistributionDensity = 1.0 / BUCKETTABLESIZE;
2822
2823
0
  if ((x >= 0) && (x <= BUCKETTABLESIZE)) {
2824
0
    return UniformDistributionDensity;
2825
0
  } else {
2826
0
    return 0.0;
2827
0
  }
2828
0
} // UniformDensity
2829
2830
/**
2831
 * This routine computes a trapezoidal approximation to the
2832
 * integral of a function over a small delta in x.
2833
 * @param f1  value of function at x1
2834
 * @param f2  value of function at x2
2835
 * @param Dx  x2 - x1 (should always be positive)
2836
 * @return Approximation of the integral of the function from x1 to x2.
2837
 */
2838
0
static double Integral(double f1, double f2, double Dx) {
  // Trapezoid rule: average the endpoint values and scale by the width.
  return 0.5 * (f1 + f2) * Dx;
} // Integral
2841
2842
/**
2843
 * This routine counts the number of cluster samples which
2844
 * fall within the various histogram buckets in Buckets.  Only
2845
 * one dimension of each sample is examined.  The exact meaning
2846
 * of the Mean and StdDev parameters depends on the
2847
 * distribution which is being analyzed (this info is in the
2848
 * Buckets data structure).  For normal distributions, Mean
2849
 * and StdDev have the expected meanings.  For uniform and
2850
 * random distributions the Mean is the center point of the
2851
 * range and the StdDev is 1/2 the range.  A dimension with
2852
 * zero standard deviation cannot be statistically analyzed.
2853
 * In this case, a pseudo-analysis is used.
2854
 * The Buckets data structure is filled in.
2855
 * @param Buckets histogram buckets to count samples
2856
 * @param Cluster cluster whose samples are being analyzed
2857
 * @param Dim dimension of samples which is being analyzed
2858
 * @param ParamDesc description of the dimension
2859
 * @param Mean  "mean" of the distribution
2860
 * @param StdDev  "standard deviation" of the distribution
2861
 */
2862
static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
                        float Mean, float StdDev) {
  LIST SearchState;
  SAMPLE *Sample;

  // Zero every histogram bucket before tallying samples into it.
  for (int i = 0; i < Buckets->NumberOfBuckets; i++) {
    Buckets->Count[i] = 0;
  }

  if (StdDev == 0.0) {
    /* A zero standard deviation makes a statistical analysis impossible.
       Pseudo-analysis instead: samples exactly on the mean are distributed
       evenly across all buckets (round-robin); samples above the mean go in
       the last bucket, samples below it in the first bucket. */
    InitSampleSearch(SearchState, Cluster);
    int RoundRobinBucket = 0;
    while ((Sample = NextSample(&SearchState)) != nullptr) {
      uint16_t BucketID;
      if (Sample->Mean[Dim] > Mean) {
        BucketID = Buckets->NumberOfBuckets - 1;
      } else if (Sample->Mean[Dim] < Mean) {
        BucketID = 0;
      } else {
        BucketID = RoundRobinBucket;
      }
      Buckets->Count[BucketID] += 1;
      // Advance the round-robin cursor once per sample, wrapping at the end.
      if (++RoundRobinBucket >= Buckets->NumberOfBuckets) {
        RoundRobinBucket = 0;
      }
    }
  } else {
    // Map every sample of the cluster into the histogram bucket chosen by
    // the distribution this Buckets structure was built for.
    InitSampleSearch(SearchState, Cluster);
    while ((Sample = NextSample(&SearchState)) != nullptr) {
      uint16_t BucketID;
      switch (Buckets->Distribution) {
        case normal:
          BucketID = NormalBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);
          break;
        case D_random:
        case uniform:
          BucketID = UniformBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);
          break;
        default:
          BucketID = 0;
      }
      // Bucket[] folds symmetric buckets together before counting.
      Buckets->Count[Buckets->Bucket[BucketID]] += 1;
    }
  }
} // FillBuckets
2916
2917
/**
2918
 * This routine determines which bucket x falls into in the
2919
 * discrete normal distribution defined by kNormalMean
2920
 * and kNormalStdDev.  x values which exceed the range of
2921
 * the discrete distribution are clipped.
2922
 * @param ParamDesc used to identify circular dimensions
2923
 * @param x value to be normalized
2924
 * @param Mean  mean of normal distribution
2925
 * @param StdDev  standard deviation of normal distribution
2926
 * @return Bucket number into which x falls
2927
 */
2928
0
static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) {
  // Fold circular dimensions back into the half-range window around Mean.
  if (ParamDesc->Circular) {
    if (x - Mean > ParamDesc->HalfRange) {
      x -= ParamDesc->Range;
    } else if (x - Mean < -ParamDesc->HalfRange) {
      x += ParamDesc->Range;
    }
  }

  // Rescale x onto the discrete normal defined by kNormalMean/kNormalStdDev,
  // then clip to the valid bucket range [0, BUCKETTABLESIZE - 1].
  const float X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
  if (X < 0) {
    return 0;
  }
  return (X > BUCKETTABLESIZE - 1)
             ? static_cast<uint16_t>(BUCKETTABLESIZE - 1)
             : static_cast<uint16_t>(floor(static_cast<double>(X)));
} // NormalBucket
2949
2950
/**
2951
 * This routine determines which bucket x falls into in the
2952
 * discrete uniform distribution defined by
2953
 * BUCKETTABLESIZE.  x values which exceed the range of
2954
 * the discrete distribution are clipped.
2955
 * @param ParamDesc used to identify circular dimensions
2956
 * @param x value to be normalized
2957
 * @param Mean  center of range of uniform distribution
2958
 * @param StdDev  1/2 the range of the uniform distribution
2959
 * @return Bucket number into which x falls
2960
 */
2961
0
static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) {
  // Fold circular dimensions back into the half-range window around Mean.
  if (ParamDesc->Circular) {
    if (x - Mean > ParamDesc->HalfRange) {
      x -= ParamDesc->Range;
    } else if (x - Mean < -ParamDesc->HalfRange) {
      x += ParamDesc->Range;
    }
  }

  // Linearly map [Mean - StdDev, Mean + StdDev] onto the bucket index range,
  // then clip to [0, BUCKETTABLESIZE - 1].
  const float X = (x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0;
  if (X < 0) {
    return 0;
  }
  return (X > BUCKETTABLESIZE - 1)
             ? static_cast<uint16_t>(BUCKETTABLESIZE - 1)
             : static_cast<uint16_t>(floor(static_cast<double>(X)));
} // UniformBucket
2982
2983
/**
2984
 * This routine performs a chi-square goodness of fit test
2985
 * on the histogram data in the Buckets data structure.
2986
 * true is returned if the histogram matches the probability
2987
 * distribution which was specified when the Buckets
2988
 * structure was originally created.  Otherwise false is
2989
 * returned.
2990
 * @param Buckets   histogram data to perform chi-square test on
2991
 * @return true if samples match distribution, false otherwise
2992
 */
2993
0
static bool DistributionOK(BUCKETS *Buckets) {
  // Pearson chi-squared statistic: sum of (observed - expected)^2 / expected
  // over every bucket of the histogram.
  float TotalDifference = 0.0;
  for (int i = 0; i < Buckets->NumberOfBuckets; i++) {
    const float FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
    TotalDifference += (FrequencyDifference * FrequencyDifference) / Buckets->ExpectedCount[i];
  }

  // The samples match the distribution iff the statistic does not exceed the
  // precomputed chi-squared threshold.
  return !(TotalDifference > Buckets->ChiSquared);
} // DistributionOK
3012
3013
/**
3014
 * This routine computes the degrees of freedom that should
3015
 * be used in a chi-squared test with the specified number of
3016
 * histogram buckets.  The result is always rounded up to
3017
 * the next even number so that the value of chi-squared can be
3018
 * computed more easily.  This will cause the value of
3019
 * chi-squared to be higher than the optimum value, resulting
3020
 * in the chi-square test being more lenient than optimum.
3021
 * @param Distribution    distribution being tested for
3022
 * @param HistogramBuckets  number of buckets in chi-square test
3023
 * @return The number of degrees of freedom for a chi-square test
3024
 */
3025
0
static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets) {
  // Number of constraints consumed by each distribution type, indexed by the
  // DISTRIBUTION enum.  The table is read-only, so make it constexpr instead
  // of a mutable function-local static.
  static constexpr uint8_t DegreeOffsets[] = {3, 3, 1};

  uint16_t AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[static_cast<int>(Distribution)];
  // Round up to the next even number so the chi-squared value can be computed
  // more easily (the computation only works for even degrees of freedom);
  // this makes the test slightly more lenient than optimum.
  if (Odd(AdjustedNumBuckets)) {
    AdjustedNumBuckets++;
  }
  return AdjustedNumBuckets;
} // DegreesOfFreedom
3037
3038
/**
3039
 * This routine multiplies each ExpectedCount histogram entry
3040
 * by NewSampleCount/OldSampleCount so that the histogram
3041
 * is now adjusted to the new sample count.
3042
 * @param Buckets histogram data structure to adjust
3043
 * @param NewSampleCount  new sample count to adjust to
3044
 */
3045
0
static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount) {
  // Scale every expected count by NewSampleCount/OldSampleCount so the
  // expected histogram corresponds to the new total number of samples.
  const double AdjustFactor =
      static_cast<double>(NewSampleCount) / static_cast<double>(Buckets->SampleCount);

  for (int i = 0; i < Buckets->NumberOfBuckets; i++) {
    Buckets->ExpectedCount[i] *= AdjustFactor;
  }

  // Record the sample count the histogram is now normalized to.
  Buckets->SampleCount = NewSampleCount;
} // AdjustBuckets
3059
3060
/**
3061
 * This routine sets the bucket counts in the specified histogram
3062
 * to zero.
3063
 * @param Buckets histogram data structure to init
3064
 */
3065
0
static void InitBuckets(BUCKETS *Buckets) {
  // Reset all observed bucket counts to zero; expected counts are untouched.
  for (int BucketIndex = 0; BucketIndex < Buckets->NumberOfBuckets; ++BucketIndex) {
    Buckets->Count[BucketIndex] = 0;
  }
} // InitBuckets
3073
3074
/**
3075
 * This routine is used to search a list of structures which
3076
 * hold pre-computed chi-squared values for a chi-squared
3077
 * value whose corresponding alpha field matches the alpha
3078
 * field of SearchKey.
3079
 *
3080
 * It is called by the list search routines.
3081
 *
3082
 * @param arg1 chi-squared struct being tested for a match
3083
 * @param arg2 chi-squared struct that is the search key
3084
 * @return true if ChiStruct's Alpha matches SearchKey's Alpha
3085
 */
3086
static int AlphaMatch(void *arg1,   // CHISTRUCT *ChiStruct,
3087
0
                      void *arg2) { // CHISTRUCT *SearchKey)
3088
0
  auto *ChiStruct = static_cast<CHISTRUCT *>(arg1);
3089
0
  auto *SearchKey = static_cast<CHISTRUCT *>(arg2);
3090
3091
0
  return (ChiStruct->Alpha == SearchKey->Alpha);
3092
3093
0
} // AlphaMatch
3094
3095
/**
3096
 * This routine attempts to find an x value at which Function
3097
 * goes to zero (i.e. a root of the function).  It will only
3098
 * work correctly if a solution actually exists and there
3099
 * are no extrema between the solution and the InitialGuess.
3100
 * The algorithms used are extremely primitive.
3101
 *
3102
 * @param Function  function whose zero is to be found
3103
 * @param FunctionParams  arbitrary data to pass to function
3104
 * @param InitialGuess  point to start solution search at
3105
 * @param Accuracy  maximum allowed error
3106
 * @return Solution of function (x for which f(x) = 0).
3107
 */
3108
static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy)
#define INITIALDELTA 0.1
#define DELTARATIO 0.1
{
  // Newton-style iteration with a numerically estimated slope.  The last
  // guesses with positive and negative f bracket the root; iteration stops
  // once that bracket is narrower than Accuracy.
  double x = InitialGuess;
  double Delta = INITIALDELTA;
  double LastPosX = FLT_MAX;
  double LastNegX = -FLT_MAX;
  double f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);

  while (Abs(LastPosX - LastNegX) > Accuracy) {
    // Track the outer bounds of the current estimate.
    if (f < 0) {
      LastNegX = x;
    } else {
      LastPosX = x;
    }

    // Approximate the slope of f at x with a forward difference.
    const double Slope =
        ((*Function)(static_cast<CHISTRUCT *>(FunctionParams), x + Delta) - f) / Delta;

    // Take a Newton step toward the root.
    const double xDelta = f / Slope;
    x -= xDelta;

    // Shrink the slope-sampling delta to a fraction of the step just taken.
    const double NewDelta = Abs(xDelta) * DELTARATIO;
    if (NewDelta < Delta) {
      Delta = NewDelta;
    }

    // Evaluate the function at the new guess for the next iteration.
    f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);
  }
  return x;
} // Solve
3153
3154
/**
3155
 * This routine computes the area under a chi density curve
3156
 * from 0 to x, minus the desired area under the curve.  The
3157
 * number of degrees of freedom of the chi curve is specified
3158
 * in the ChiParams structure.  The desired area is also
3159
 * specified in the ChiParams structure as Alpha (or 1 minus
3160
 * the desired area).  This routine is intended to be passed
3161
 * to the Solve() function to find the value of chi-squared
3162
 * which will yield a desired area under the right tail of
3163
 * the chi density curve.  The function will only work for
3164
 * even degrees of freedom.  The equations are based on
3165
 * integrating the chi density curve in parts to obtain
3166
 * a series that can be used to compute the area under the
3167
 * curve.
3168
 * @param ChiParams contains degrees of freedom and alpha
3169
 * @param x   value of chi-squared to evaluate
3170
 * @return Error between actual and desired area under the chi curve.
3171
 */
3172
0
static double ChiArea(CHISTRUCT *ChiParams, double x) {
  // For even degrees of freedom 2(N+1), the upper-tail area of the chi
  // density has the closed form
  //   P(X > x) = exp(-x/2) * sum_{i=0..N} x^i / (2^i * i!)
  // Accumulate the series incrementally: Denominator tracks 2^i * i! and
  // PowerOfx tracks x^i.
  const int N = ChiParams->DegreesOfFreedom / 2 - 1;
  double SeriesTotal = 1;
  double Denominator = 1;
  double PowerOfx = 1;
  for (int i = 1; i <= N; i++) {
    Denominator *= 2 * i;
    PowerOfx *= x;
    SeriesTotal += PowerOfx / Denominator;
  }
  // Return the error between the actual tail area and the desired one.
  return SeriesTotal * exp(-0.5 * x) - ChiParams->Alpha;
} // ChiArea
3190
3191
/**
3192
 * This routine looks at all samples in the specified cluster.
3193
 * It computes a running estimate of the percentage of the
3194
 * characters which have more than 1 sample in the cluster.
3195
 * When this percentage exceeds MaxIllegal, true is returned.
3196
 * Otherwise false is returned.  The CharID
3197
 * fields must contain integers which identify the training
3198
 * characters which were used to generate the sample.  One
3199
 * integer is used for each sample.  The NumChar field in
3200
 * the Clusterer must contain the number of characters in the
3201
 * training set.  All CharID fields must be between 0 and
3202
 * NumChar-1.  The main function of this routine is to help
3203
 * identify clusters which need to be split further, i.e. if
3204
 * numerous training characters have 2 or more features which are
3205
 * contained in the same cluster, then the cluster should be
3206
 * split.
3207
 *
3208
 * @param Clusterer data structure holding cluster tree
3209
 * @param Cluster   cluster containing samples to be tested
3210
 * @param MaxIllegal  max percentage of samples allowed to have
3211
 *        more than 1 feature in the cluster
3212
 * @return true if the cluster should be split, false otherwise.
3213
 */
3214
static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal)
#define ILLEGAL_CHAR 2
{
  // Per-character flags: 0 = not yet seen in this cluster, 1 = seen once,
  // ILLEGAL_CHAR = already counted as having multiple samples here.
  static std::vector<uint8_t> CharFlags;
  LIST SearchState;
  SAMPLE *Sample;

  // Initial estimate assumes no character repeats within the cluster.
  int32_t NumCharInCluster = Cluster->SampleCount;
  int32_t NumIllegalInCluster = 0;

  // Grow (never shrink) the flag array to cover all training characters,
  // then clear it for this cluster.
  if (Clusterer->NumChar > CharFlags.size()) {
    CharFlags.resize(Clusterer->NumChar);
  }
  for (auto &CharFlag : CharFlags) {
    CharFlag = false;
  }

  // Walk every sample: each repeat of a character shrinks the distinct-char
  // count, and the first repeat of a character bumps the illegal count.
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample(&SearchState)) != nullptr) {
    const int32_t CharID = Sample->CharID;
    if (CharFlags[CharID] == 0) {
      CharFlags[CharID] = true;
    } else {
      if (CharFlags[CharID] == 1) {
        NumIllegalInCluster++;
        CharFlags[CharID] = ILLEGAL_CHAR;
      }
      NumCharInCluster--;
      const float PercentIllegal = static_cast<float>(NumIllegalInCluster) / NumCharInCluster;
      // Bail out (and release the search state) as soon as the running
      // estimate of illegal characters exceeds the allowed maximum.
      if (PercentIllegal > MaxIllegal) {
        destroy(SearchState);
        return true;
      }
    }
  }
  return false;
} // MultipleCharSamples
3259
3260
/**
3261
 * Compute the inverse of a matrix using LU decomposition with partial pivoting.
3262
 * The return value is the sum of norms of the off-diagonal terms of the
3263
 * product of a and inv. (A measure of the error.)
3264
 */
3265
0
static double InvertMatrix(const float *input, int size, float *inv) {
  // Allocate memory for the 2D arrays.
  GENERIC_2D_ARRAY<double> U(size, size, 0.0);
  GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
  GENERIC_2D_ARRAY<double> L(size, size, 0.0);

  // Initialize the working matrices. U starts as input, L as I and U_inv as O.
  int row;
  int col;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      U[row][col] = input[row * size + col];
      L[row][col] = row == col ? 1.0 : 0.0;
      U_inv[row][col] = 0.0;
    }
  }

  // Compute forward matrix by inversion by LU decomposition of input.
  // Every row operation (pivot swap and elimination) applied to U is also
  // applied to L, so that at the end L * input == U, i.e. L records the
  // accumulated elimination; hence input^-1 = U^-1 * L.
  for (col = 0; col < size; ++col) {
    // Find best pivot: the row (at or below col) with the largest
    // absolute value in this column (partial pivoting for stability).
    int best_row = 0;
    double best_pivot = -1.0;
    for (row = col; row < size; ++row) {
      if (Abs(U[row][col]) > best_pivot) {
        best_pivot = Abs(U[row][col]);
        best_row = row;
      }
    }
    // Exchange pivot rows (in both U and L, to keep L * input == U).
    if (best_row != col) {
      for (int k = 0; k < size; ++k) {
        double tmp = U[best_row][k];
        U[best_row][k] = U[col][k];
        U[col][k] = tmp;
        tmp = L[best_row][k];
        L[best_row][k] = L[col][k];
        L[col][k] = tmp;
      }
    }
    // Now do the pivot itself: zero out the entries below the pivot by
    // adding a multiple of the pivot row to each row beneath it.
    // NOTE(review): no check for a zero pivot — a singular input yields
    // inf/NaN here rather than an error.
    for (row = col + 1; row < size; ++row) {
      double ratio = -U[row][col] / U[col][col];
      for (int j = col; j < size; ++j) {
        U[row][j] += U[col][j] * ratio;
      }
      for (int k = 0; k < size; ++k) {
        L[row][k] += L[col][k] * ratio;
      }
    }
  }
  // Next invert U. U is now upper-triangular, so back-substitute column by
  // column; U_inv is upper-triangular as well.
  for (col = 0; col < size; ++col) {
    U_inv[col][col] = 1.0 / U[col][col];
    for (row = col - 1; row >= 0; --row) {
      double total = 0.0;
      for (int k = col; k > row; --k) {
        total += U[row][k] * U_inv[k][col];
      }
      U_inv[row][col] = -total / U[row][row];
    }
  }
  // Now the answer is U_inv.L. The inner sum starts at k = row because
  // U_inv[row][k] is zero for k < row (upper-triangular).
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      double sum = 0.0;
      for (int k = row; k < size; ++k) {
        sum += U_inv[row][k] * L[k][col];
      }
      inv[row * size + col] = sum;
    }
  }
  // Check matrix product: accumulate the absolute values of the
  // off-diagonal entries of input * inv as the returned error measure
  // (exactly zero for a perfect inverse).
  double error_sum = 0.0;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      double sum = 0.0;
      for (int k = 0; k < size; ++k) {
        sum += static_cast<double>(input[row * size + k]) * inv[k * size + col];
      }
      if (row != col) {
        error_sum += Abs(sum);
      }
    }
  }
  return error_sum;
}
3351
3352
} // namespace tesseract