Coverage Report

Created: 2025-07-23 07:12

/src/tesseract/src/ccstruct/linlsq.cpp
Line |  Count | Source
   1 |        | /**********************************************************************
   2 |        |  * File:        linlsq.cpp  (Formerly llsq.c)
   3 |        |  * Description: Linear Least squares fitting code.
   4 |        |  * Author:      Ray Smith
   5 |        |  *
   6 |        |  * (C) Copyright 1991, Hewlett-Packard Ltd.
   7 |        |  ** Licensed under the Apache License, Version 2.0 (the "License");
   8 |        |  ** you may not use this file except in compliance with the License.
   9 |        |  ** You may obtain a copy of the License at
  10 |        |  ** http://www.apache.org/licenses/LICENSE-2.0
  11 |        |  ** Unless required by applicable law or agreed to in writing, software
  12 |        |  ** distributed under the License is distributed on an "AS IS" BASIS,
  13 |        |  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 |        |  ** See the License for the specific language governing permissions and
  15 |        |  ** limitations under the License.
  16 |        |  *
  17 |        |  **********************************************************************/
  18 |        |
  19 |        | #include "linlsq.h"
  20 |        | #include <cmath> // for std::sqrt
  21 |        | #include <cstdio>
  22 |        | #include "errcode.h"
  23 |        |
  24 |        | namespace tesseract {
  25 |        |
  26 |        | constexpr ERRCODE EMPTY_LLSQ("Can't delete from an empty LLSQ");
  27 |        |
  28 |        | /**********************************************************************
  29 |        |  * LLSQ::clear
  30 |        |  *
  31 |        |  * Function to initialize a LLSQ.
  32 |        |  **********************************************************************/
  33 |        |
  34 |  2.24M | void LLSQ::clear() {  // initialize
  35 |  2.24M |   total_weight = 0.0; // no elements
  36 |  2.24M |   sigx = 0.0;         // update accumulators
  37 |  2.24M |   sigy = 0.0;
  38 |  2.24M |   sigxx = 0.0;
  39 |  2.24M |   sigxy = 0.0;
  40 |  2.24M |   sigyy = 0.0;
  41 |  2.24M | }
  42 |        |
  43 |        | /**********************************************************************
  44 |        |  * LLSQ::add
  45 |        |  *
  46 |        |  * Add an element to the accumulator.
  47 |        |  **********************************************************************/
  48 |        |
  49 |  4.03M | void LLSQ::add(double x, double y) { // add an element
  50 |  4.03M |   total_weight++;                    // count elements
  51 |  4.03M |   sigx += x;                         // update accumulators
  52 |  4.03M |   sigy += y;
  53 |  4.03M |   sigxx += x * x;
  54 |  4.03M |   sigxy += x * y;
  55 |  4.03M |   sigyy += y * y;
  56 |  4.03M | }
  57 |        | // Adds an element with a specified weight.
  58 |  1.49G | void LLSQ::add(double x, double y, double weight) {
  59 |  1.49G |   total_weight += weight;
  60 |  1.49G |   sigx += x * weight; // update accumulators
  61 |  1.49G |   sigy += y * weight;
  62 |  1.49G |   sigxx += x * x * weight;
  63 |  1.49G |   sigxy += x * y * weight;
  64 |  1.49G |   sigyy += y * y * weight;
  65 |  1.49G | }
  66 |        | // Adds a whole LLSQ.
  67 |      0 | void LLSQ::add(const LLSQ &other) {
  68 |      0 |   total_weight += other.total_weight;
  69 |      0 |   sigx += other.sigx; // update accumulators
  70 |      0 |   sigy += other.sigy;
  71 |      0 |   sigxx += other.sigxx;
  72 |      0 |   sigxy += other.sigxy;
  73 |      0 |   sigyy += other.sigyy;
  74 |      0 | }
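
Note: the three add() overloads above only update the six raw sums (total_weight, sigx, sigy, sigxx, sigxy, sigyy); the fit itself is read back through m(), c() and rms() further down in this file. A minimal usage sketch, assuming only the LLSQ interface declared in linlsq.h (the points and the weight below are purely illustrative):

    #include <cstdio>
    #include "linlsq.h"

    // Accumulate a few points and read back the least-squares line y = m*x + c.
    void fit_example() {
      tesseract::LLSQ fit;
      fit.clear();            // start from an empty accumulator
      fit.add(0.0, 1.0);      // unweighted points
      fit.add(1.0, 3.1);
      fit.add(2.0, 4.9);
      fit.add(3.0, 7.0, 2.0); // weighted point: counts twice
      double m = fit.m();     // gradient
      double c = fit.c(m);    // intercept for that gradient
      printf("y = %gx + %g, rms error %g\n", m, c, fit.rms(m, c));
    }
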
  75 |        |
  76 |        | /**********************************************************************
  77 |        |  * LLSQ::remove
  78 |        |  *
  79 |        |  * Delete an element from the accumulator.
  80 |        |  **********************************************************************/
  81 |        |
  82 |      0 | void LLSQ::remove(double x, double y) { // delete an element
  83 |      0 |   if (total_weight <= 0.0) {            // illegal
  84 |      0 |     EMPTY_LLSQ.error("LLSQ::remove", ABORT);
  85 |      0 |   }
  86 |      0 |   total_weight--; // count elements
  87 |      0 |   sigx -= x;      // update accumulators
  88 |      0 |   sigy -= y;
  89 |      0 |   sigxx -= x * x;
  90 |      0 |   sigxy -= x * y;
  91 |      0 |   sigyy -= y * y;
  92 |      0 | }
  93 |        |
  94 |        | /**********************************************************************
  95 |        |  * LLSQ::m
  96 |        |  *
  97 |        |  * Return the gradient of the line fit.
  98 |        |  **********************************************************************/
  99 |        |
 100 |   147k | double LLSQ::m() const { // get gradient
 101 |   147k |   double covar = covariance();
 102 |   147k |   double x_var = x_variance();
 103 |   147k |   if (x_var != 0.0) {
 104 |   125k |     return covar / x_var;
 105 |   125k |   } else {
 106 |  21.8k |     return 0.0; // too little
 107 |  21.8k |   }
 108 |   147k | }
 109 |        |
 110 |        | /**********************************************************************
 111 |        |  * LLSQ::c
 112 |        |  *
 113 |        |  * Return the constant of the line fit.
 114 |        |  **********************************************************************/
 115 |        |
 116 |  93.8k | double LLSQ::c(double m) const { // get constant
 117 |  93.8k |   if (total_weight > 0.0) {
 118 |  93.8k |     return (sigy - m * sigx) / total_weight;
 119 |  93.8k |   } else {
 120 |      0 |     return 0; // too little
 121 |      0 |   }
 122 |  93.8k | }
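
Note: taken together, m() and c() above return the standard weighted least-squares slope and intercept. In accumulator notation (W = total_weight, Σ_x = sigx, Σ_y = sigy), what the two functions compute is:

    m = \frac{\operatorname{Cov}(x, y)}{\operatorname{Var}(x)}, \qquad
    c = \bar{y} - m\,\bar{x} = \frac{\Sigma_y - m\,\Sigma_x}{W}

with Cov and Var supplied by the covariance() and x_variance() accessors, and both functions falling back to 0 when there is too little data (zero x-variance or zero total weight).
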
 123 |        |
 124 |        | /**********************************************************************
 125 |        |  * LLSQ::rms
 126 |        |  *
 127 |        |  * Return the rms error of the fit.
 128 |        |  **********************************************************************/
 129 |        |
 130 |  93.8k | double LLSQ::rms(double m, double c) const { // get error
 131 |  93.8k |   double error;                              // total error
 132 |        |
 133 |  93.8k |   if (total_weight > 0) {
 134 |  93.8k |     error = sigyy + m * (m * sigxx + 2 * (c * sigx - sigxy)) + c * (total_weight * c - 2 * sigy);
 135 |  93.8k |     if (error >= 0) {
 136 |  93.3k |       error = std::sqrt(error / total_weight); // sqrt of mean
 137 |  93.3k |     } else {
 138 |    460 |       error = 0;
 139 |    460 |     }
 140 |  93.8k |   } else {
 141 |      0 |     error = 0; // too little
 142 |      0 |   }
 143 |  93.8k |   return error;
 144 |  93.8k | }
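
Note: the long expression on line 134 is the expanded weighted sum of squared residuals; dividing by W = total_weight and taking the square root on line 136 turns it into an rms value. With Σ_xx, Σ_xy, Σ_yy, Σ_x, Σ_y standing for the sigxx, sigxy, sigyy, sigx, sigy members:

    \sum_i w_i\,(y_i - m x_i - c)^2
      = \Sigma_{yy} + m^2\Sigma_{xx} + c^2 W - 2m\,\Sigma_{xy} - 2c\,\Sigma_y + 2mc\,\Sigma_x
      = \Sigma_{yy} + m\bigl(m\,\Sigma_{xx} + 2(c\,\Sigma_x - \Sigma_{xy})\bigr) + c\,(Wc - 2\Sigma_y)

The error >= 0 guard on line 135 covers the cases where rounding drives this expression slightly negative (the 460 hits recorded on line 138).
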
 145 |        |
 146 |        | /**********************************************************************
 147 |        |  * LLSQ::pearson
 148 |        |  *
 149 |        |  * Return the pearson product moment correlation coefficient.
 150 |        |  **********************************************************************/
 151 |        |
 152 |      0 | double LLSQ::pearson() const { // get correlation
 153 |      0 |   double r = 0.0;              // Correlation is 0 if insufficient data.
 154 |        |
 155 |      0 |   double covar = covariance();
 156 |      0 |   if (covar != 0.0) {
 157 |      0 |     double var_product = x_variance() * y_variance();
 158 |      0 |     if (var_product > 0.0) {
 159 |      0 |       r = covar / std::sqrt(var_product);
 160 |      0 |     }
 161 |      0 |   }
 162 |      0 |   return r;
 163 |      0 | }
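
Note: pearson() is the product-moment coefficient written in terms of the same moment accessors:

    r = \frac{\operatorname{Cov}(x, y)}{\sqrt{\operatorname{Var}(x)\,\operatorname{Var}(y)}}

with r left at 0 whenever the covariance is 0 or either variance vanishes (the var_product > 0.0 guard).
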
 164 |        |
 165 |        | // Returns the x,y means as an FCOORD.
 166 |  1.98M | FCOORD LLSQ::mean_point() const {
 167 |  1.98M |   if (total_weight > 0.0) {
 168 |  1.98M |     return FCOORD(sigx / total_weight, sigy / total_weight);
 169 |  1.98M |   } else {
 170 |      0 |     return FCOORD(0.0f, 0.0f);
 171 |      0 |   }
 172 |  1.98M | }
 173 |        |
 174 |        | // Returns the sqrt of the mean squared error measured perpendicular from the
 175 |        | // line through mean_point() in the direction dir.
 176 |        | //
 177 |        | // Derivation:
 178 |        | //   Lemma:  Let v and x_i (i=1..N) be k-dimensional vectors (1xk matrices).
 179 |        | //     Let % be dot product and ' be transpose.  Note that:
 180 |        | //      Sum[i=1..N] (v % x_i)^2
 181 |        | //         = v * [x_1' x_2' ... x_N'] * [x_1' x_2' .. x_N']' * v'
 182 |        | //     If x_i have average 0 we have:
 183 |        | //       = v * (N * COVARIANCE_MATRIX(X)) * v'
 184 |        | //     Expanded for the case that k = 2, where we treat the dimensions
 185 |        | //     as x_i and y_i, this is:
 186 |        | //       = v * (N * [VAR(X), COV(X,Y); COV(X,Y) VAR(Y)]) * v'
 187 |        | //  Now, we are trying to calculate the mean squared error, where v is
 188 |        | //  perpendicular to our line of interest:
 189 |        | //    Mean squared error
 190 |        | //      = E [ (v % (x_i - x_avg))^2 ]
 191 |        | //      = Sum (v % (x_i - x_avg))^2 / N
 192 |        | //      = v * N * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] / N * v'
 193 |        | //      = v * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] * v'
 194 |        | //      = code below
 195 |      0 | double LLSQ::rms_orth(const FCOORD &dir) const {
 196 |      0 |   FCOORD v = !dir;
 197 |      0 |   v.normalise();
 198 |      0 |   return std::sqrt(x_variance() * v.x() * v.x() + 2 * covariance() * v.x() * v.y() +
 199 |      0 |                    y_variance() * v.y() * v.y());
 200 |      0 | }
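
Note: the derivation in the comment reduces to a 2x2 quadratic form. With v the normalised perpendicular to dir constructed at the top of the function, the return statement computes:

    \mathrm{rms\_orth}(\mathit{dir}) = \sqrt{\,v^{\top}
      \begin{pmatrix} \operatorname{Var}(x) & \operatorname{Cov}(x,y) \\
                      \operatorname{Cov}(x,y) & \operatorname{Var}(y) \end{pmatrix} v\,},
    \qquad v \perp \mathit{dir},\ \lVert v \rVert = 1

which expands to exactly the three-term sum inside std::sqrt on lines 198-199.
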
 201 |        |
 202 |        | // Returns the direction of the fitted line as a unit vector, using the
 203 |        | // least mean squared perpendicular distance. The line runs through the
 204 |        | // mean_point, i.e. a point p on the line is given by:
 205 |        | // p = mean_point() + lambda * vector_fit() for some real number lambda.
 206 |        | // Note that the result (0<=x<=1, -1<=y<=1) is directionally ambiguous
 207 |        | // and may be negated without changing its meaning.
 208 |        | // Fitting a line m + λv to a set of N points Pi = (xi, yi), where
 209 |        | // m is the mean point (μ, ν) and
 210 |        | // v is the direction vector (cosθ, sinθ).
 211 |        | // The perpendicular distance of each Pi from the line is:
 212 |        | // (Pi - m) x v, where x is the scalar cross product.
 213 |        | // Total squared error is thus:
 214 |        | // E = ∑((xi - μ)sinθ - (yi - ν)cosθ)²
 215 |        | //   = ∑(xi - μ)²sin²θ - 2∑(xi - μ)(yi - ν)sinθ cosθ + ∑(yi - ν)²cos²θ
 216 |        | //   = NVar(xi)sin²θ - 2NCovar(xi, yi)sinθ cosθ + NVar(yi)cos²θ   (Eq 1)
 217 |        | // where Var(xi) is the variance of xi,
 218 |        | // and Covar(xi, yi) is the covariance of xi, yi.
 219 |        | // Taking the derivative wrt θ and setting to 0 to obtain the min/max:
 220 |        | // 0 = 2NVar(xi)sinθ cosθ - 2NCovar(xi, yi)(cos²θ - sin²θ) - 2NVar(yi)sinθ cosθ
 221 |        | // => Covar(xi, yi)(cos²θ - sin²θ) = (Var(xi) - Var(yi))sinθ cosθ
 222 |        | // Using double angles:
 223 |        | // 2Covar(xi, yi)cos2θ = (Var(xi) - Var(yi))sin2θ   (Eq 2)
 224 |        | // So θ = 0.5 atan2(2Covar(xi, yi), Var(xi) - Var(yi))   (Eq 3)
 225 |        | //
 226 |        | // Because it involves 2θ, Eq 2 has 2 solutions 90 degrees apart, but which
 227 |        | // is the min and which is the max? From Eq 1:
 228 |        | // E/N = Var(xi)sin²θ - 2Covar(xi, yi)sinθ cosθ + Var(yi)cos²θ
 229 |        | // and 90 degrees away, using sin/cos equivalences:
 230 |        | // E'/N = Var(xi)cos²θ + 2Covar(xi, yi)sinθ cosθ + Var(yi)sin²θ
 231 |        | // The second error is smaller (making it the minimum) iff
 232 |        | // E'/N < E/N, i.e.:
 233 |        | // (Var(xi) - Var(yi))(cos²θ - sin²θ) < -4Covar(xi, yi)sinθ cosθ
 234 |        | // Using double angles:
 235 |        | // (Var(xi) - Var(yi))cos2θ < -2Covar(xi, yi)sin2θ   (InEq 1)
 236 |        | // But atan2(2Covar(xi, yi), Var(xi) - Var(yi)) picks 2θ such that:
 237 |        | // sgn(cos2θ) = sgn(Var(xi) - Var(yi)) and sgn(sin2θ) = sgn(Covar(xi, yi)),
 238 |        | // so InEq 1 can *never* be true, making the atan2 result *always* the min!
 239 |        | // In the degenerate case, where Covar(xi, yi) = 0 AND Var(xi) = Var(yi),
 240 |        | // the 2 solutions have equal error and the inequality is still false.
 241 |        | // Therefore the solution really is as trivial as Eq 3.
 242 |        | //
 243 |        | // This is equivalent to returning the Principal Component in PCA, or the
 244 |        | // eigenvector corresponding to the largest eigenvalue in the covariance
 245 |        | // matrix.  However, atan2 is much simpler! The one reference I found that
 246 |        | // uses this formula is http://web.mit.edu/18.06/www/Essays/tlsfit.pdf but
 247 |        | // that is still a much more complex derivation. It seems Pearson had already
 248 |        | // found this simple solution in 1901.
 249 |        | // http://books.google.com/books?id=WXwvAQAAIAAJ&pg=PA559
 250 |      0 | FCOORD LLSQ::vector_fit() const {
 251 |      0 |   double x_var = x_variance();
 252 |      0 |   double y_var = y_variance();
 253 |      0 |   double covar = covariance();
 254 |      0 |   double theta = 0.5 * atan2(2.0 * covar, x_var - y_var);
 255 |      0 |   FCOORD result(cos(theta), sin(theta));
 256 |      0 |   return result;
 257 |      0 | }
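
Note: vector_fit(), like rms_orth() above, records zero hits in this run (lines 250-257). A sketch of the kind of check that would exercise it, assuming only the LLSQ and FCOORD interfaces already used in this file; the points, names and expected direction are illustrative:

    #include <cmath>
    #include <cstdio>
    #include "linlsq.h"

    // Points lying exactly on y = 2x + 1 should give a unit direction
    // parallel to (1, 2), up to the sign ambiguity noted in the comment.
    void vector_fit_check() {
      tesseract::LLSQ fit;
      fit.clear();
      for (int i = 0; i < 10; ++i) {
        fit.add(i, 2.0 * i + 1.0);
      }
      tesseract::FCOORD dir = fit.vector_fit();
      double ex = 1.0 / std::sqrt(5.0);   // unit vector along (1, 2)
      double ey = 2.0 / std::sqrt(5.0);
      double dot = dir.x() * ex + dir.y() * ey;
      printf("|dot| = %g (expect ~1 for a perfect fit)\n", std::fabs(dot));
    }
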
 258 |        |
 259 |        | } // namespace tesseract