/src/tesseract/src/ccstruct/linlsq.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: linlsq.cpp (Formerly llsq.c) |
3 | | * Description: Linear Least squares fitting code. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | * (C) Copyright 1991, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | #include "linlsq.h" |
20 | | #include <cmath> // for std::sqrt |
21 | | #include <cstdio> |
22 | | #include "errcode.h" |
23 | | |
24 | | namespace tesseract { |
25 | | |
26 | | constexpr ERRCODE EMPTY_LLSQ("Can't delete from an empty LLSQ"); |
27 | | |
28 | | /********************************************************************** |
29 | | * LLSQ::clear |
30 | | * |
31 | | * Function to initialize a LLSQ. |
32 | | **********************************************************************/ |
33 | | |
34 | 2.24M | void LLSQ::clear() { // initialize |
35 | 2.24M | total_weight = 0.0; // no elements |
36 | 2.24M | sigx = 0.0; // update accumulators |
37 | 2.24M | sigy = 0.0; |
38 | 2.24M | sigxx = 0.0; |
39 | 2.24M | sigxy = 0.0; |
40 | 2.24M | sigyy = 0.0; |
41 | 2.24M | } |
42 | | |
43 | | /********************************************************************** |
44 | | * LLSQ::add |
45 | | * |
46 | | * Add an element to the accumulator. |
47 | | **********************************************************************/ |
48 | | |
49 | 4.03M | void LLSQ::add(double x, double y) { // add an element |
50 | 4.03M | total_weight++; // count elements |
51 | 4.03M | sigx += x; // update accumulators |
52 | 4.03M | sigy += y; |
53 | 4.03M | sigxx += x * x; |
54 | 4.03M | sigxy += x * y; |
55 | 4.03M | sigyy += y * y; |
56 | 4.03M | } |
57 | | // Adds an element with a specified weight. |
58 | 1.49G | void LLSQ::add(double x, double y, double weight) { |
59 | 1.49G | total_weight += weight; |
60 | 1.49G | sigx += x * weight; // update accumulators |
61 | 1.49G | sigy += y * weight; |
62 | 1.49G | sigxx += x * x * weight; |
63 | 1.49G | sigxy += x * y * weight; |
64 | 1.49G | sigyy += y * y * weight; |
65 | 1.49G | } |
66 | | // Adds a whole LLSQ. |
67 | 0 | void LLSQ::add(const LLSQ &other) { |
68 | 0 | total_weight += other.total_weight; |
69 | 0 | sigx += other.sigx; // update accumulators |
70 | 0 | sigy += other.sigy; |
71 | 0 | sigxx += other.sigxx; |
72 | 0 | sigxy += other.sigxy; |
73 | 0 | sigyy += other.sigyy; |
74 | 0 | } |
75 | | |
76 | | /********************************************************************** |
77 | | * LLSQ::remove |
78 | | * |
79 |       |  * Delete an element from the accumulator. |
80 | | **********************************************************************/ |
81 | | |
82 | 0 | void LLSQ::remove(double x, double y) { // delete an element |
83 | 0 | if (total_weight <= 0.0) { // illegal |
84 | 0 | EMPTY_LLSQ.error("LLSQ::remove", ABORT); |
85 | 0 | } |
86 | 0 | total_weight--; // count elements |
87 | 0 | sigx -= x; // update accumulators |
88 | 0 | sigy -= y; |
89 | 0 | sigxx -= x * x; |
90 | 0 | sigxy -= x * y; |
91 | 0 | sigyy -= y * y; |
92 | 0 | } |
93 | | |
94 | | /********************************************************************** |
95 | | * LLSQ::m |
96 | | * |
97 | | * Return the gradient of the line fit. |
98 | | **********************************************************************/ |
99 | | |
100 | 147k | double LLSQ::m() const { // get gradient |
101 | 147k | double covar = covariance(); |
102 | 147k | double x_var = x_variance(); |
103 | 147k | if (x_var != 0.0) { |
104 | 125k | return covar / x_var; |
105 | 125k | } else { |
106 | 21.8k | return 0.0; // too little |
107 | 21.8k | } |
108 | 147k | } |
109 | | |
110 | | /********************************************************************** |
111 | | * LLSQ::c |
112 | | * |
113 | | * Return the constant of the line fit. |
114 | | **********************************************************************/ |
115 | | |
116 | 93.8k | double LLSQ::c(double m) const { // get constant |
117 | 93.8k | if (total_weight > 0.0) { |
118 | 93.8k | return (sigy - m * sigx) / total_weight; |
119 | 93.8k | } else { |
120 | 0 | return 0; // too little |
121 | 0 | } |
122 | 93.8k | } |
123 | | |
124 | | /********************************************************************** |
125 | | * LLSQ::rms |
126 | | * |
127 | | * Return the rms error of the fit. |
128 | | **********************************************************************/ |
129 | | |
130 | 93.8k | double LLSQ::rms(double m, double c) const { // get error |
131 | 93.8k | double error; // total error |
132 | | |
133 | 93.8k | if (total_weight > 0) { |
134 | 93.8k | error = sigyy + m * (m * sigxx + 2 * (c * sigx - sigxy)) + c * (total_weight * c - 2 * sigy); |
135 | 93.8k | if (error >= 0) { |
136 | 93.3k | error = std::sqrt(error / total_weight); // sqrt of mean |
137 | 93.3k | } else { |
138 | 460 | error = 0; |
139 | 460 | } |
140 | 93.8k | } else { |
141 | 0 | error = 0; // too little |
142 | 0 | } |
143 | 93.8k | return error; |
144 | 93.8k | } |
145 | | |
146 | | /********************************************************************** |
147 | | * LLSQ::pearson |
148 | | * |
149 | | * Return the pearson product moment correlation coefficient. |
150 | | **********************************************************************/ |
151 | | |
152 | 0 | double LLSQ::pearson() const { // get correlation |
153 | 0 | double r = 0.0; // Correlation is 0 if insufficient data. |
154 | |
|
155 | 0 | double covar = covariance(); |
156 | 0 | if (covar != 0.0) { |
157 | 0 | double var_product = x_variance() * y_variance(); |
158 | 0 | if (var_product > 0.0) { |
159 | 0 | r = covar / std::sqrt(var_product); |
160 | 0 | } |
161 | 0 | } |
162 | 0 | return r; |
163 | 0 | } |
164 | | |
165 | | // Returns the x,y means as an FCOORD. |
166 | 1.98M | FCOORD LLSQ::mean_point() const { |
167 | 1.98M | if (total_weight > 0.0) { |
168 | 1.98M | return FCOORD(sigx / total_weight, sigy / total_weight); |
169 | 1.98M | } else { |
170 | 0 | return FCOORD(0.0f, 0.0f); |
171 | 0 | } |
172 | 1.98M | } |
173 | | |
174 | | // Returns the sqrt of the mean squared error measured perpendicular from the |
175 | | // line through mean_point() in the direction dir. |
176 | | // |
177 | | // Derivation: |
178 | | // Lemma: Let v and x_i (i=1..N) be a k-dimensional vectors (1xk matrices). |
179 | | // Let % be dot product and ' be transpose. Note that: |
180 | | // Sum[i=1..N] (v % x_i)^2 |
181 | | // = v * [x_1' x_2' ... x_N'] * [x_1' x_2' .. x_N']' * v' |
182 | | // If x_i have average 0 we have: |
183 | | // = v * (N * COVARIANCE_MATRIX(X)) * v' |
184 | | // Expanded for the case that k = 2, where we treat the dimensions |
185 | | // as x_i and y_i, this is: |
186 | | // = v * (N * [VAR(X), COV(X,Y); COV(X,Y) VAR(Y)]) * v' |
187 | | // Now, we are trying to calculate the mean squared error, where v is |
188 | | // perpendicular to our line of interest: |
189 | | // Mean squared error |
190 | | // = E [ (v % (x_i - x_avg))) ^2 ] |
191 | | // = Sum (v % (x_i - x_avg))^2 / N |
192 | | // = v * N * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] / N * v' |
193 | | // = v * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] * v' |
194 | | // = code below |
195 | 0 | double LLSQ::rms_orth(const FCOORD &dir) const { |
196 | 0 | FCOORD v = !dir; |
197 | 0 | v.normalise(); |
198 | 0 | return std::sqrt(x_variance() * v.x() * v.x() + 2 * covariance() * v.x() * v.y() + |
199 | 0 | y_variance() * v.y() * v.y()); |
200 | 0 | } |
201 | | |
202 | | // Returns the direction of the fitted line as a unit vector, using the |
203 | | // least mean squared perpendicular distance. The line runs through the |
204 | | // mean_point, i.e. a point p on the line is given by: |
205 | | // p = mean_point() + lambda * vector_fit() for some real number lambda. |
206 |       | // Note that the result (0<=x<=1, -1<=y<=1) is directionally ambiguous |
207 | | // and may be negated without changing its meaning. |
208 | | // Fitting a line m + ๐v to a set of N points Pi = (xi, yi), where |
209 | | // m is the mean point (๐, ๐) and |
210 | | // v is the direction vector (cos๐, sin๐) |
211 | | // The perpendicular distance of each Pi from the line is: |
212 | | // (Pi - m) x v, where x is the scalar cross product. |
213 | | // Total squared error is thus: |
214 | | // E = โ((xi - ๐)sin๐ - (yi - ๐)cos๐)ยฒ |
215 | | // = โ(xi - ๐)ยฒsinยฒ๐ - 2โ(xi - ๐)(yi - ๐)sin๐ cos๐ + โ(yi - ๐)ยฒcosยฒ๐ |
216 | | // = NVar(xi)sinยฒ๐ - 2NCovar(xi, yi)sin๐ cos๐ + NVar(yi)cosยฒ๐ (Eq 1) |
217 | | // where Var(xi) is the variance of xi, |
218 | | // and Covar(xi, yi) is the covariance of xi, yi. |
219 | | // Taking the derivative wrt ๐ and setting to 0 to obtain the min/max: |
220 | | // 0 = 2NVar(xi)sin๐ cos๐ -2NCovar(xi, yi)(cosยฒ๐ - sinยฒ๐) -2NVar(yi)sin๐ cos๐ |
221 | | // => Covar(xi, yi)(cosยฒ๐ - sinยฒ๐) = (Var(xi) - Var(yi))sin๐ cos๐ |
222 | | // Using double angles: |
223 | | // 2Covar(xi, yi)cos2๐ = (Var(xi) - Var(yi))sin2๐ (Eq 2) |
224 | | // So ๐ = 0.5 atan2(2Covar(xi, yi), Var(xi) - Var(yi)) (Eq 3) |
225 | | |
226 | | // Because it involves 2๐ , Eq 2 has 2 solutions 90 degrees apart, but which |
227 | | // is the min and which is the max? From Eq1: |
228 | | // E/N = Var(xi)sinยฒ๐ - 2Covar(xi, yi)sin๐ cos๐ + Var(yi)cosยฒ๐ |
229 | | // and 90 degrees away, using sin/cos equivalences: |
230 | | // E'/N = Var(xi)cosยฒ๐ + 2Covar(xi, yi)sin๐ cos๐ + Var(yi)sinยฒ๐ |
231 | | // The second error is smaller (making it the minimum) iff |
232 | | // E'/N < E/N ie: |
233 | | // (Var(xi) - Var(yi))(cosยฒ๐ - sinยฒ๐) < -4Covar(xi, yi)sin๐ cos๐ |
234 | | // Using double angles: |
235 | | // (Var(xi) - Var(yi))cos2๐ < -2Covar(xi, yi)sin2๐ (InEq 1) |
236 | | // But atan2(2Covar(xi, yi), Var(xi) - Var(yi)) picks 2๐ such that: |
237 | | // sgn(cos2๐) = sgn(Var(xi) - Var(yi)) and sgn(sin2๐) = sgn(Covar(xi, yi)) |
238 | | // so InEq1 can *never* be true, making the atan2 result *always* the min! |
239 | | // In the degenerate case, where Covar(xi, yi) = 0 AND Var(xi) = Var(yi), |
240 | | // the 2 solutions have equal error and the inequality is still false. |
241 | | // Therefore the solution really is as trivial as Eq 3. |
242 | | |
243 | | // This is equivalent to returning the Principal Component in PCA, or the |
244 | | // eigenvector corresponding to the largest eigenvalue in the covariance |
245 | | // matrix. However, atan2 is much simpler! The one reference I found that |
246 | | // uses this formula is http://web.mit.edu/18.06/www/Essays/tlsfit.pdf but |
247 | | // that is still a much more complex derivation. It seems Pearson had already |
248 | | // found this simple solution in 1901. |
249 | | // http://books.google.com/books?id=WXwvAQAAIAAJ&pg=PA559 |
250 | 0 | FCOORD LLSQ::vector_fit() const { |
251 | 0 | double x_var = x_variance(); |
252 | 0 | double y_var = y_variance(); |
253 | 0 | double covar = covariance(); |
254 | 0 | double theta = 0.5 * atan2(2.0 * covar, x_var - y_var); |
255 | 0 | FCOORD result(cos(theta), sin(theta)); |
256 | 0 | return result; |
257 | 0 | } |
258 | | |
259 | | } // namespace tesseract |