Coverage Report

Created: 2025-07-12 06:44

/src/tesseract/src/ccstruct/statistc.h
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 * File:        statistc.h  (Formerly stats.h)
3
 * Description: Class description for STATS class.
4
 * Author:      Ray Smith
5
 *
6
 * (C) Copyright 1991, Hewlett-Packard Ltd.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
#ifndef TESSERACT_CCSTRUCT_STATISTC_H_
20
#define TESSERACT_CCSTRUCT_STATISTC_H_
21
22
#include <cstdio>
23
#include "kdpair.h"
24
#include "scrollview.h"
25
26
namespace tesseract {
27
28
// Simple histogram-based statistics for integer values in a known
29
// range, such that the range is small compared to the number of samples.
30
class TESS_API STATS {
31
public:
32
  // The histogram buckets are in the range
33
  // [min_bucket_value, max_bucket_value].
34
  // Any data under min_bucket_value is silently mapped to min_bucket_value,
35
  // and likewise, any data over max_bucket_value is silently mapped to
36
  // max_bucket_value.
37
  // In the internal array, min_bucket_value maps to index 0 and the array
38
  // size is 1 + max_bucket_value - min_bucket_value.
39
  STATS(int32_t min_bucket_value, int32_t max_bucket_value);
40
3.04M
  STATS() = default; // empty; lets arrays of STATS be default-constructed
41
42
  ~STATS();
43
44
  // (Re)Sets the range and clears the counts.
45
  // See the constructor for info on max and min values.
46
  bool set_range(int32_t min_bucket_value, int32_t max_bucket_value);
47
48
  void clear(); // empty buckets
49
50
  void add(int32_t value, int32_t count); // presumably adds count samples of value -- confirm in statistc.cpp
51
52
  // "Accessors" return various statistics on the data.
53
  int32_t mode() const; // get mode of samples
54
  double mean() const;  // get mean of samples
55
  double sd() const;    // standard deviation
56
  // Returns the fractile value such that frac fraction (in [0,1]) of samples
57
  // has a value less than the return value.
58
  double ile(double frac) const;
59
  // Returns the minimum used entry in the histogram (ie the minimum of the
60
  // data, NOT the minimum of the supplied range, nor is it an index.)
61
  // Would normally be called min(), but min is a macro in the Windows headers used with VC++.
62
  int32_t min_bucket() const; // Find min
63
  // Returns the maximum used entry in the histogram (ie the maximum of the
64
  // data, NOT the maximum of the supplied range, nor is it an index.)
65
  int32_t max_bucket() const; // Find max
66
  // Finds a more useful estimate of median than ile(0.5).
67
  // Overcomes a problem with ile() - if the samples are, for example,
68
  // 6,6,13,14 ile(0.5) returns 7.0 - when a more useful value would be midway
69
  // between 6 and 13 = 9.5
70
  double median() const; // get median of samples
71
  // Returns the count of the given value, clamping value to the bucket range.
72
583M
  int32_t pile_count(int32_t value) const {
73
583M
    if (buckets_ == nullptr) { // no bucket array: report an empty pile
74
0
      return 0;
75
0
    }
76
583M
    if (value <= rangemin_) { // at or below the range: read the first bucket
77
5.19M
      return buckets_[0];
78
5.19M
    }
79
578M
    if (value >= rangemax_) { // at or above the range: read the last bucket
80
3.92M
      return buckets_[rangemax_ - rangemin_];
81
3.92M
    }
82
574M
    return buckets_[value - rangemin_]; // in range: index is the offset from rangemin_
83
578M
  }
84
  // Returns the total count of all buckets.
85
15.1M
  int32_t get_total() const {
86
15.1M
    return total_count_; // total of all piles
87
15.1M
  }
88
  // Returns true if x is a local min.
89
  bool local_min(int32_t x) const;
90
91
  // Apply a triangular smoothing filter to the stats.
92
  // This makes the modes a bit more useful.
93
  // The factor gives the height of the triangle, i.e. the weight of the
94
  // centre.
95
  void smooth(int32_t factor);
96
97
  // Cluster the samples into max_clusters clusters.
98
  // Each call runs one iteration. The array of clusters must be
99
  // max_clusters+1 in size as cluster 0 is used to indicate which samples
100
  // have been used.
101
  // The return value is the current number of clusters.
102
  int32_t cluster(float lower, // thresholds
103
                  float upper,
104
                  float multiple,       // distance threshold
105
                  int32_t max_clusters, // max no to make
106
                  STATS *clusters);     // array of clusters
107
108
  // Finds (at most) the top max_modes modes, well actually the whole peak
109
  // around each mode, returning them in the given modes vector as a <mean of
110
  // peak, total count of peak> pair in order of decreasing total count. Since
111
  // the mean is the key and the count the data in the pair, a single call to
112
  // sort on the output will re-sort by increasing mean of peak if that is more
113
  // useful than decreasing total count. Returns the actual number of modes
114
  // found.
115
  int top_n_modes(int max_modes, std::vector<KDPairInc<float, int>> &modes) const;
116
117
  // Prints a summary and table of the histogram.
118
  void print() const;
119
  // Prints summary stats only of the histogram.
120
  void print_summary() const;
121
122
#ifndef GRAPHICS_DISABLED
123
  // Draws the histogram as a series of rectangles.
124
  void plot(ScrollView *window,              // window to draw in
125
            float xorigin,                   // x origin of histogram
126
            float yorigin,                   // y origin of histogram
127
            float xscale,                    // x size of one unit
128
            float yscale,                    // y size of one unit
129
            ScrollView::Color colour) const; // colour to draw in
130
131
  // Draws a line graph of the histogram.
132
  void plotline(ScrollView *window,              // window to draw in
133
                float xorigin,                   // x origin of histogram
134
                float yorigin,                   // y origin of histogram
135
                float xscale,                    // x size of one unit
136
                float yscale,                    // y size of one unit
137
                ScrollView::Color colour) const; // colour to draw in
138
#endif                                           // !GRAPHICS_DISABLED
139
140
private:
141
  int32_t rangemin_ = 0; // min of range
142
  int32_t rangemax_ = 0;       // max of range
143
  int32_t total_count_ = 0;    // no of samples
144
  int32_t *buckets_ = nullptr; // array of cells; NOTE(review): no copy control is declared here -- confirm who frees this and that STATS is never copied
145
};
146
147
} // namespace tesseract
148
149
#endif // TESSERACT_CCSTRUCT_STATISTC_H_