Coverage Report

Created: 2025-07-23 07:12

/src/tesseract/src/ccstruct/linlsq.cpp
Line |  Count | Source
   1 |        | /**********************************************************************
   2 |        |  * File:        linlsq.cpp  (Formerly llsq.c)
   3 |        |  * Description: Linear Least squares fitting code.
   4 |        |  * Author:      Ray Smith
   5 |        |  *
   6 |        |  * (C) Copyright 1991, Hewlett-Packard Ltd.
   7 |        |  ** Licensed under the Apache License, Version 2.0 (the "License");
   8 |        |  ** you may not use this file except in compliance with the License.
   9 |        |  ** You may obtain a copy of the License at
  10 |        |  ** http://www.apache.org/licenses/LICENSE-2.0
  11 |        |  ** Unless required by applicable law or agreed to in writing, software
  12 |        |  ** distributed under the License is distributed on an "AS IS" BASIS,
  13 |        |  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 |        |  ** See the License for the specific language governing permissions and
  15 |        |  ** limitations under the License.
  16 |        |  *
  17 |        |  **********************************************************************/
  18 |        |
  19 |        | #include "linlsq.h"
  20 |        | #include <cmath> // for std::sqrt
  21 |        | #include <cstdio>
  22 |        | #include "errcode.h"
  23 |        |
  24 |        | namespace tesseract {
  25 |        |
  26 |        | constexpr ERRCODE EMPTY_LLSQ("Can't delete from an empty LLSQ");
  27 |        |
  28 |        | /**********************************************************************
  29 |        |  * LLSQ::clear
  30 |        |  *
  31 |        |  * Function to initialize a LLSQ.
  32 |        |  **********************************************************************/
  33 |        |
  34 |  2.24M | void LLSQ::clear() {  // initialize
  35 |  2.24M |   total_weight = 0.0; // no elements
  36 |  2.24M |   sigx = 0.0;         // update accumulators
  37 |  2.24M |   sigy = 0.0;
  38 |  2.24M |   sigxx = 0.0;
  39 |  2.24M |   sigxy = 0.0;
  40 |  2.24M |   sigyy = 0.0;
  41 |  2.24M | }
  42 |        |
  43 |        | /**********************************************************************
  44 |        |  * LLSQ::add
  45 |        |  *
  46 |        |  * Add an element to the accumulator.
  47 |        |  **********************************************************************/
  48 |        |
  49 |  4.03M | void LLSQ::add(double x, double y) { // add an element
  50 |  4.03M |   total_weight++;                    // count elements
  51 |  4.03M |   sigx += x;                         // update accumulators
  52 |  4.03M |   sigy += y;
  53 |  4.03M |   sigxx += x * x;
  54 |  4.03M |   sigxy += x * y;
  55 |  4.03M |   sigyy += y * y;
  56 |  4.03M | }
  57 |        | // Adds an element with a specified weight.
  58 |  1.49G | void LLSQ::add(double x, double y, double weight) {
  59 |  1.49G |   total_weight += weight;
  60 |  1.49G |   sigx += x * weight; // update accumulators
  61 |  1.49G |   sigy += y * weight;
  62 |  1.49G |   sigxx += x * x * weight;
  63 |  1.49G |   sigxy += x * y * weight;
  64 |  1.49G |   sigyy += y * y * weight;
  65 |  1.49G | }
  66 |        | // Adds a whole LLSQ.
  67 |      0 | void LLSQ::add(const LLSQ &other) {
  68 |      0 |   total_weight += other.total_weight;
  69 |      0 |   sigx += other.sigx; // update accumulators
  70 |      0 |   sigy += other.sigy;
  71 |      0 |   sigxx += other.sigxx;
  72 |      0 |   sigxy += other.sigxy;
  73 |      0 |   sigyy += other.sigyy;
  74 |      0 | }
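
Note: the three add() overloads above only update the six raw sums (total_weight, sigx, sigy, sigxx, sigxy, sigyy); the fit itself is read back through m(), c() and rms() further down in this file. A minimal usage sketch, assuming only the LLSQ interface declared in linlsq.h (the points and the weight below are purely illustrative):

    #include <cstdio>
    #include "linlsq.h"

    // Accumulate a few points and read back the least-squares line y = m*x + c.
    void fit_example() {
      tesseract::LLSQ fit;
      fit.clear();            // start from an empty accumulator
      fit.add(0.0, 1.0);      // unweighted points
      fit.add(1.0, 3.1);
      fit.add(2.0, 4.9);
      fit.add(3.0, 7.0, 2.0); // weighted point: counts twice
      double m = fit.m();     // gradient
      double c = fit.c(m);    // intercept for that gradient
      printf("y = %gx + %g, rms error %g\n", m, c, fit.rms(m, c));
    }
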
  75 |        |
  76 |        | /**********************************************************************
  77 |        |  * LLSQ::remove
  78 |        |  *
  79 |        |  * Delete an element from the accumulator.
  80 |        |  **********************************************************************/
  81 |        |
  82 |      0 | void LLSQ::remove(double x, double y) { // delete an element
  83 |      0 |   if (total_weight <= 0.0) {            // illegal
  84 |      0 |     EMPTY_LLSQ.error("LLSQ::remove", ABORT);
  85 |      0 |   }
  86 |      0 |   total_weight--; // count elements
  87 |      0 |   sigx -= x;      // update accumulators
  88 |      0 |   sigy -= y;
  89 |      0 |   sigxx -= x * x;
  90 |      0 |   sigxy -= x * y;
  91 |      0 |   sigyy -= y * y;
  92 |      0 | }
  93 |        |
  94 |        | /**********************************************************************
  95 |        |  * LLSQ::m
  96 |        |  *
  97 |        |  * Return the gradient of the line fit.
  98 |        |  **********************************************************************/
  99 |        |
 100 |   147k | double LLSQ::m() const { // get gradient
 101 |   147k |   double covar = covariance();
 102 |   147k |   double x_var = x_variance();
 103 |   147k |   if (x_var != 0.0) {
 104 |   125k |     return covar / x_var;
 105 |   125k |   } else {
 106 |  21.8k |     return 0.0; // too little
 107 |  21.8k |   }
 108 |   147k | }
 109 |        |
 110 |        | /**********************************************************************
 111 |        |  * LLSQ::c
 112 |        |  *
 113 |        |  * Return the constant of the line fit.
 114 |        |  **********************************************************************/
 115 |        |
 116 |  93.8k | double LLSQ::c(double m) const { // get constant
 117 |  93.8k |   if (total_weight > 0.0) {
 118 |  93.8k |     return (sigy - m * sigx) / total_weight;
 119 |  93.8k |   } else {
 120 |      0 |     return 0; // too little
 121 |      0 |   }
 122 |  93.8k | }
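
Note: taken together, m() and c() above return the standard weighted least-squares slope and intercept. In accumulator notation (W = total_weight, Σ_x = sigx, Σ_y = sigy), what the two functions compute is:

    m = \frac{\operatorname{Cov}(x, y)}{\operatorname{Var}(x)}, \qquad
    c = \bar{y} - m\,\bar{x} = \frac{\Sigma_y - m\,\Sigma_x}{W}

with Cov and Var supplied by the covariance() and x_variance() accessors, and both functions falling back to 0 when there is too little data (zero x-variance or zero total weight).
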
 123 |        |
 124 |        | /**********************************************************************
 125 |        |  * LLSQ::rms
 126 |        |  *
 127 |        |  * Return the rms error of the fit.
 128 |        |  **********************************************************************/
 129 |        |
 130 |  93.8k | double LLSQ::rms(double m, double c) const { // get error
 131 |  93.8k |   double error;                              // total error
 132 |        |
 133 |  93.8k |   if (total_weight > 0) {
 134 |  93.8k |     error = sigyy + m * (m * sigxx + 2 * (c * sigx - sigxy)) + c * (total_weight * c - 2 * sigy);
 135 |  93.8k |     if (error >= 0) {
 136 |  93.3k |       error = std::sqrt(error / total_weight); // sqrt of mean
 137 |  93.3k |     } else {
 138 |    460 |       error = 0;
 139 |    460 |     }
 140 |  93.8k |   } else {
 141 |      0 |     error = 0; // too little
 142 |      0 |   }
 143 |  93.8k |   return error;
 144 |  93.8k | }
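
Note: the long expression on line 134 is the expanded weighted sum of squared residuals; dividing by W = total_weight and taking the square root on line 136 turns it into an rms value. With Σ_xx, Σ_xy, Σ_yy, Σ_x, Σ_y standing for the sigxx, sigxy, sigyy, sigx, sigy members:

    \sum_i w_i\,(y_i - m x_i - c)^2
      = \Sigma_{yy} + m^2\Sigma_{xx} + c^2 W - 2m\,\Sigma_{xy} - 2c\,\Sigma_y + 2mc\,\Sigma_x
      = \Sigma_{yy} + m\bigl(m\,\Sigma_{xx} + 2(c\,\Sigma_x - \Sigma_{xy})\bigr) + c\,(Wc - 2\Sigma_y)

The error >= 0 guard on line 135 covers the cases where rounding drives this expression slightly negative (the 460 hits recorded on line 138).
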
 145 |        |
 146 |        | /**********************************************************************
 147 |        |  * LLSQ::pearson
 148 |        |  *
 149 |        |  * Return the pearson product moment correlation coefficient.
 150 |        |  **********************************************************************/
 151 |        |
 152 |      0 | double LLSQ::pearson() const { // get correlation
 153 |      0 |   double r = 0.0;              // Correlation is 0 if insufficient data.
 154 |        |
 155 |      0 |   double covar = covariance();
 156 |      0 |   if (covar != 0.0) {
 157 |      0 |     double var_product = x_variance() * y_variance();
 158 |      0 |     if (var_product > 0.0) {
 159 |      0 |       r = covar / std::sqrt(var_product);
 160 |      0 |     }
 161 |      0 |   }
 162 |      0 |   return r;
 163 |      0 | }
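
Note: pearson() is the product-moment coefficient written in terms of the same moment accessors:

    r = \frac{\operatorname{Cov}(x, y)}{\sqrt{\operatorname{Var}(x)\,\operatorname{Var}(y)}}

with r left at 0 whenever the covariance is 0 or either variance vanishes (the var_product > 0.0 guard).
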
 164 |        |
 165 |        | // Returns the x,y means as an FCOORD.
 166 |  1.98M | FCOORD LLSQ::mean_point() const {
 167 |  1.98M |   if (total_weight > 0.0) {
 168 |  1.98M |     return FCOORD(sigx / total_weight, sigy / total_weight);
 169 |  1.98M |   } else {
 170 |      0 |     return FCOORD(0.0f, 0.0f);
 171 |      0 |   }
 172 |  1.98M | }
 173 |        |
 174 |        | // Returns the sqrt of the mean squared error measured perpendicular from the
 175 |        | // line through mean_point() in the direction dir.
 176 |        | //
 177 |        | // Derivation:
 178 |        | //   Lemma:  Let v and x_i (i=1..N) be k-dimensional vectors (1xk matrices).
 179 |        | //     Let % be dot product and ' be transpose.  Note that:
 180 |        | //      Sum[i=1..N] (v % x_i)^2
 181 |        | //         = v * [x_1' x_2' ... x_N'] * [x_1' x_2' .. x_N']' * v'
 182 |        | //     If x_i have average 0 we have:
 183 |        | //       = v * (N * COVARIANCE_MATRIX(X)) * v'
 184 |        | //     Expanded for the case that k = 2, where we treat the dimensions
 185 |        | //     as x_i and y_i, this is:
 186 |        | //       = v * (N * [VAR(X), COV(X,Y); COV(X,Y) VAR(Y)]) * v'
 187 |        | //  Now, we are trying to calculate the mean squared error, where v is
 188 |        | //  perpendicular to our line of interest:
 189 |        | //    Mean squared error
 190 |        | //      = E [ (v % (x_i - x_avg))^2 ]
 191 |        | //      = Sum (v % (x_i - x_avg))^2 / N
 192 |        | //      = v * N * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] / N * v'
 193 |        | //      = v * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] * v'
 194 |        | //      = code below
 195 |      0 | double LLSQ::rms_orth(const FCOORD &dir) const {
 196 |      0 |   FCOORD v = !dir;
 197 |      0 |   v.normalise();
 198 |      0 |   return std::sqrt(x_variance() * v.x() * v.x() + 2 * covariance() * v.x() * v.y() +
 199 |      0 |                    y_variance() * v.y() * v.y());
 200 |      0 | }
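
Note: the derivation in the comment reduces to a 2x2 quadratic form. With v the normalised perpendicular to dir constructed at the top of the function, the return statement computes:

    \mathrm{rms\_orth}(\mathit{dir}) = \sqrt{\,v^{\top}
      \begin{pmatrix} \operatorname{Var}(x) & \operatorname{Cov}(x,y) \\
                      \operatorname{Cov}(x,y) & \operatorname{Var}(y) \end{pmatrix} v\,},
    \qquad v \perp \mathit{dir},\ \lVert v \rVert = 1

which expands to exactly the three-term sum inside std::sqrt on lines 198-199.
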
 201 |        |
 202 |        | // Returns the direction of the fitted line as a unit vector, using the
 203 |        | // least mean squared perpendicular distance. The line runs through the
 204 |        | // mean_point, i.e. a point p on the line is given by:
 205 |        | // p = mean_point() + lambda * vector_fit() for some real number lambda.
 206 |        | // Note that the result (0<=x<=1, -1<=y<=1) is directionally ambiguous
 207 |        | // and may be negated without changing its meaning.
 208 |        | // Fitting a line m + λv to a set of N points Pi = (xi, yi), where
 209 |        | // m is the mean point (μ, ν) and
 210 |        | // v is the direction vector (cosθ, sinθ).
 211 |        | // The perpendicular distance of each Pi from the line is:
 212 |        | // (Pi - m) x v, where x is the scalar cross product.
 213 |        | // Total squared error is thus:
 214 |        | // E = ∑((xi - μ)sinθ - (yi - ν)cosθ)²
 215 |        | //   = ∑(xi - μ)²sin²θ - 2∑(xi - μ)(yi - ν)sinθ cosθ + ∑(yi - ν)²cos²θ
 216 |        | //   = NVar(xi)sin²θ - 2NCovar(xi, yi)sinθ cosθ + NVar(yi)cos²θ   (Eq 1)
 217 |        | // where Var(xi) is the variance of xi,
 218 |        | // and Covar(xi, yi) is the covariance of xi, yi.
 219 |        | // Taking the derivative wrt θ and setting to 0 to obtain the min/max:
 220 |        | // 0 = 2NVar(xi)sinθ cosθ - 2NCovar(xi, yi)(cos²θ - sin²θ) - 2NVar(yi)sinθ cosθ
 221 |        | // => Covar(xi, yi)(cos²θ - sin²θ) = (Var(xi) - Var(yi))sinθ cosθ
 222 |        | // Using double angles:
 223 |        | // 2Covar(xi, yi)cos2θ = (Var(xi) - Var(yi))sin2θ   (Eq 2)
 224 |        | // So θ = 0.5 atan2(2Covar(xi, yi), Var(xi) - Var(yi))   (Eq 3)
 225 |        | //
 226 |        | // Because it involves 2θ, Eq 2 has 2 solutions 90 degrees apart, but which
 227 |        | // is the min and which is the max? From Eq 1:
 228 |        | // E/N = Var(xi)sin²θ - 2Covar(xi, yi)sinθ cosθ + Var(yi)cos²θ
 229 |        | // and 90 degrees away, using sin/cos equivalences:
 230 |        | // E'/N = Var(xi)cos²θ + 2Covar(xi, yi)sinθ cosθ + Var(yi)sin²θ
 231 |        | // The second error is smaller (making it the minimum) iff
 232 |        | // E'/N < E/N, i.e.:
 233 |        | // (Var(xi) - Var(yi))(cos²θ - sin²θ) < -4Covar(xi, yi)sinθ cosθ
 234 |        | // Using double angles:
 235 |        | // (Var(xi) - Var(yi))cos2θ < -2Covar(xi, yi)sin2θ   (InEq 1)
 236 |        | // But atan2(2Covar(xi, yi), Var(xi) - Var(yi)) picks 2θ such that:
 237 |        | // sgn(cos2θ) = sgn(Var(xi) - Var(yi)) and sgn(sin2θ) = sgn(Covar(xi, yi)),
 238 |        | // so InEq 1 can *never* be true, making the atan2 result *always* the min!
 239 |        | // In the degenerate case, where Covar(xi, yi) = 0 AND Var(xi) = Var(yi),
 240 |        | // the 2 solutions have equal error and the inequality is still false.
 241 |        | // Therefore the solution really is as trivial as Eq 3.
 242 |        | //
 243 |        | // This is equivalent to returning the Principal Component in PCA, or the
 244 |        | // eigenvector corresponding to the largest eigenvalue in the covariance
 245 |        | // matrix.  However, atan2 is much simpler! The one reference I found that
 246 |        | // uses this formula is http://web.mit.edu/18.06/www/Essays/tlsfit.pdf but
 247 |        | // that is still a much more complex derivation. It seems Pearson had already
 248 |        | // found this simple solution in 1901.
 249 |        | // http://books.google.com/books?id=WXwvAQAAIAAJ&pg=PA559
 250 |      0 | FCOORD LLSQ::vector_fit() const {
 251 |      0 |   double x_var = x_variance();
 252 |      0 |   double y_var = y_variance();
 253 |      0 |   double covar = covariance();
 254 |      0 |   double theta = 0.5 * atan2(2.0 * covar, x_var - y_var);
 255 |      0 |   FCOORD result(cos(theta), sin(theta));
 256 |      0 |   return result;
 257 |      0 | }
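
Note: vector_fit(), like rms_orth() above, records zero hits in this run (lines 250-257). A sketch of the kind of check that would exercise it, assuming only the LLSQ and FCOORD interfaces already used in this file; the points, names and expected direction are illustrative:

    #include <cmath>
    #include <cstdio>
    #include "linlsq.h"

    // Points lying exactly on y = 2x + 1 should give a unit direction
    // parallel to (1, 2), up to the sign ambiguity noted in the comment.
    void vector_fit_check() {
      tesseract::LLSQ fit;
      fit.clear();
      for (int i = 0; i < 10; ++i) {
        fit.add(i, 2.0 * i + 1.0);
      }
      tesseract::FCOORD dir = fit.vector_fit();
      double ex = 1.0 / std::sqrt(5.0);   // unit vector along (1, 2)
      double ey = 2.0 / std::sqrt(5.0);
      double dot = dir.x() * ex + dir.y() * ey;
      printf("|dot| = %g (expect ~1 for a perfect fit)\n", std::fabs(dot));
    }
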
 258 |        |
 259 |        | } // namespace tesseract