/src/tesseract/src/ccstruct/statistc.h
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: statistc.h (Formerly stats.h) |
3 | | * Description: Class description for STATS class. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | * (C) Copyright 1991, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | #ifndef TESSERACT_CCSTRUCT_STATISTC_H_ |
20 | | #define TESSERACT_CCSTRUCT_STATISTC_H_ |
21 | | |
22 | | #include <cstdio> |
23 | | #include "kdpair.h" |
24 | | #include "scrollview.h" |
25 | | |
26 | | namespace tesseract { |
27 | | |
28 | | // Simple histogram-based statistics for integer values in a known |
29 | | // range, such that the range is small compared to the number of samples. |
30 | | class TESS_API STATS { |
31 | | public: |
32 | | // The histogram buckets are in the range |
33 | | // [min_bucket_value, max_bucket_value]. |
34 | | // Any data under min_bucket value is silently mapped to min_bucket_value, |
35 | | // and likewise, any data over max_bucket_value is silently mapped to |
36 | | // max_bucket_value. |
37 | | // In the internal array, min_bucket_value maps to 0 and |
38 | | // 1 + max_bucket_value - min_bucket_value to the array size. |
39 | | STATS(int32_t min_bucket_value, int32_t max_bucket_value); |
40 | 3.04M | STATS() = default; // empty for arrays |
41 | | |
42 | | ~STATS(); |
43 | | |
44 | | // (Re)Sets the range and clears the counts. |
45 | | // See the constructor for info on max and min values. |
46 | | bool set_range(int32_t min_bucket_value, int32_t max_bucket_value); |
47 | | |
48 | | void clear(); // empty buckets |
49 | | |
50 | | void add(int32_t value, int32_t count); |
51 | | |
52 | | // "Accessors" return various statistics on the data. |
53 | | int32_t mode() const; // get mode of samples |
54 | | double mean() const; // get mean of samples |
55 | | double sd() const; // standard deviation |
56 | | // Returns the fractile value such that frac fraction (in [0,1]) of samples |
57 | | // has a value less than the return value. |
58 | | double ile(double frac) const; |
59 | | // Returns the minimum used entry in the histogram (ie the minimum of the |
60 | | // data, NOT the minimum of the supplied range, nor is it an index.) |
61 | | // Would normally be called min(), but that is a reserved word in VC++. |
62 | | int32_t min_bucket() const; // Find min |
63 | | // Returns the maximum used entry in the histogram (ie the maximum of the |
64 | | // data, NOT the maximum of the supplied range, nor is it an index.) |
65 | | int32_t max_bucket() const; // Find max |
66 | | // Finds a more useful estimate of median than ile(0.5). |
67 | | // Overcomes a problem with ile() - if the samples are, for example, |
68 | | // 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway |
69 | | // between 6 and 13 = 9.5 |
70 | | double median() const; // get median of samples |
71 | | // Returns the count of the given value. |
72 | 583M | int32_t pile_count(int32_t value) const { |
73 | 583M | if (buckets_ == nullptr) { |
74 | 0 | return 0; |
75 | 0 | } |
76 | 583M | if (value <= rangemin_) { |
77 | 5.19M | return buckets_[0]; |
78 | 5.19M | } |
79 | 578M | if (value >= rangemax_) { |
80 | 3.92M | return buckets_[rangemax_ - rangemin_]; |
81 | 3.92M | } |
82 | 574M | return buckets_[value - rangemin_]; |
83 | 578M | } |
84 | | // Returns the total count of all buckets. |
85 | 15.1M | int32_t get_total() const { |
86 | 15.1M | return total_count_; // total of all piles |
87 | 15.1M | } |
88 | | // Returns true if x is a local min. |
89 | | bool local_min(int32_t x) const; |
90 | | |
91 | | // Apply a triangular smoothing filter to the stats. |
92 | | // This makes the modes a bit more useful. |
93 | | // The factor gives the height of the triangle, i.e. the weight of the |
94 | | // centre. |
95 | | void smooth(int32_t factor); |
96 | | |
97 | | // Cluster the samples into max_cluster clusters. |
98 | | // Each call runs one iteration. The array of clusters must be |
99 | | // max_clusters+1 in size as cluster 0 is used to indicate which samples |
100 | | // have been used. |
101 | | // The return value is the current number of clusters. |
102 | | int32_t cluster(float lower, // thresholds |
103 | | float upper, |
104 | | float multiple, // distance threshold |
105 | | int32_t max_clusters, // max no to make |
106 | | STATS *clusters); // array of clusters |
107 | | |
108 | | // Finds (at most) the top max_modes modes, well actually the whole peak |
109 | | // around each mode, returning them in the given modes vector as a <mean of |
110 | | // peak, total count of peak> pair in order of decreasing total count. Since |
111 | | // the mean is the key and the count the data in the pair, a single call to |
112 | | // sort on the output will re-sort by increasing mean of peak if that is more |
113 | | // useful than decreasing total count. Returns the actual number of modes |
114 | | // found. |
115 | | int top_n_modes(int max_modes, std::vector<KDPairInc<float, int>> &modes) const; |
116 | | |
117 | | // Prints a summary and table of the histogram. |
118 | | void print() const; |
119 | | // Prints summary stats only of the histogram. |
120 | | void print_summary() const; |
121 | | |
122 | | #ifndef GRAPHICS_DISABLED |
123 | | // Draws the histogram as a series of rectangles. |
124 | | void plot(ScrollView *window, // window to draw in |
125 | | float xorigin, // origin of histo |
126 | | float yorigin, // gram |
127 | | float xscale, // size of one unit |
128 | | float yscale, // size of one uint |
129 | | ScrollView::Color colour) const; // colour to draw in |
130 | | |
131 | | // Draws a line graph of the histogram. |
132 | | void plotline(ScrollView *window, // window to draw in |
133 | | float xorigin, // origin of histo |
134 | | float yorigin, // gram |
135 | | float xscale, // size of one unit |
136 | | float yscale, // size of one uint |
137 | | ScrollView::Color colour) const; // colour to draw in |
138 | | #endif // !GRAPHICS_DISABLED |
139 | | |
140 | | private: |
141 | | int32_t rangemin_ = 0; // min of range |
142 | | int32_t rangemax_ = 0; // max of range |
143 | | int32_t total_count_ = 0; // no of samples |
144 | | int32_t *buckets_ = nullptr; // array of cells |
145 | | }; |
146 | | |
147 | | } // namespace tesseract |
148 | | |
149 | | #endif // TESSERACT_CCSTRUCT_STATISTC_H_ |