Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/ccstruct/detlinefit.cpp
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////
2
// File:        detlinefit.cpp
3
// Description: Deterministic least median squares line fitting.
4
// Author:      Ray Smith
5
//
6
// (C) Copyright 2008, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
//
17
///////////////////////////////////////////////////////////////////////
18
19
#include "detlinefit.h"
20
#include "helpers.h"        // for IntCastRounded
21
#include "statistc.h"
22
#include "tesserrstream.h"  // for tesserr
23
24
#include <algorithm>
25
#include <cfloat> // for FLT_MAX
26
27
namespace tesseract {
28
29
// The number of points to consider at each end.
30
const int kNumEndPoints = 3;
31
// The minimum number of points at which to switch to number of points
32
// for badly fitted lines.
33
// To ensure a sensible error metric, kMinPointsForErrorCount should be at
34
// least kMaxRealDistance / (1 - %ile) where %ile is the fractile used in
35
// ComputeUpperQuartileError.
36
const int kMinPointsForErrorCount = 16;
37
// The maximum real distance to use before switching to number of
38
// mis-fitted points, which will get square-rooted for true distance.
39
const int kMaxRealDistance = 2.0;
40
41
476k
DetLineFit::DetLineFit() : square_length_(0.0) {}
42
43
// Delete all Added points.
44
122k
void DetLineFit::Clear() {
45
122k
  pts_.clear();
46
122k
  distances_.clear();
47
122k
}
48
49
// Add a new point. Takes a copy - the pt doesn't need to stay in scope.
50
2.28M
void DetLineFit::Add(const ICOORD &pt) {
51
2.28M
  pts_.emplace_back(pt, 0);
52
2.28M
}
53
// Associates a half-width with the given point if a point overlaps the
54
// previous point by more than half the width, and its distance is further
55
// than the previous point, then the more distant point is ignored in the
56
// distance calculation. Useful for ignoring i dots and other diacritics.
57
1.88M
void DetLineFit::Add(const ICOORD &pt, int halfwidth) {
58
1.88M
  pts_.emplace_back(pt, halfwidth);
59
1.88M
}
60
61
// Fits a line to the points, ignoring the skip_first initial points and the
62
// skip_last final points, returning the fitted line as a pair of points,
63
// and the upper quartile error.
64
280k
double DetLineFit::Fit(int skip_first, int skip_last, ICOORD *pt1, ICOORD *pt2) {
65
  // Do something sensible with no points.
66
280k
  if (pts_.empty()) {
67
0
    pt1->set_x(0);
68
0
    pt1->set_y(0);
69
0
    *pt2 = *pt1;
70
0
    return 0.0;
71
0
  }
72
  // Count the points and find the first and last kNumEndPoints.
73
280k
  int pt_count = pts_.size();
74
280k
  ICOORD *starts[kNumEndPoints];
75
280k
  if (skip_first >= pt_count) {
76
0
    skip_first = pt_count - 1;
77
0
  }
78
280k
  int start_count = 0;
79
280k
  int end_i = std::min(skip_first + kNumEndPoints, pt_count);
80
906k
  for (int i = skip_first; i < end_i; ++i) {
81
626k
    starts[start_count++] = &pts_[i].pt;
82
626k
  }
83
280k
  ICOORD *ends[kNumEndPoints];
84
280k
  if (skip_last >= pt_count) {
85
0
    skip_last = pt_count - 1;
86
0
  }
87
280k
  int end_count = 0;
88
280k
  end_i = std::max(0, pt_count - kNumEndPoints - skip_last);
89
906k
  for (int i = pt_count - 1 - skip_last; i >= end_i; --i) {
90
626k
    ends[end_count++] = &pts_[i].pt;
91
626k
  }
92
  // 1 or 2 points need special treatment.
93
280k
  if (pt_count <= 2) {
94
122k
    *pt1 = *starts[0];
95
122k
    if (pt_count > 1) {
96
31.1k
      *pt2 = *ends[0];
97
91.6k
    } else {
98
91.6k
      *pt2 = *pt1;
99
91.6k
    }
100
122k
    return 0.0;
101
122k
  }
102
  // Although with between 2 and 2*kNumEndPoints-1 points, there will be
103
  // overlap in the starts, ends sets, this is OK and taken care of by the
104
  // if (*start != *end) test below, which also tests for equal input points.
105
157k
  double best_uq = -1.0;
106
  // Iterate each pair of points and find the best fitting line.
107
629k
  for (int i = 0; i < start_count; ++i) {
108
472k
    ICOORD *start = starts[i];
109
1.88M
    for (int j = 0; j < end_count; ++j) {
110
1.41M
      ICOORD *end = ends[j];
111
1.41M
      if (*start != *end) {
112
1.31M
        ComputeDistances(*start, *end);
113
        // Compute the upper quartile error from the line.
114
1.31M
        double dist = EvaluateLineFit();
115
1.31M
        if (dist < best_uq || best_uq < 0.0) {
116
330k
          best_uq = dist;
117
330k
          *pt1 = *start;
118
330k
          *pt2 = *end;
119
330k
        }
120
1.31M
      }
121
1.41M
    }
122
472k
  }
123
  // Finally compute the square root to return the true distance.
124
157k
  return best_uq > 0.0 ? sqrt(best_uq) : best_uq;
125
280k
}
126
127
// Constrained fit with a supplied direction vector. Finds the best line_pt,
128
// that is one of the supplied points having the median cross product with
129
// direction, ignoring points that have a cross product outside of the range
130
// [min_dist, max_dist]. Returns the resulting error metric using the same
131
// reduced set of points.
132
// *Makes use of floating point arithmetic*
133
double DetLineFit::ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist,
134
473k
                                  bool debug, ICOORD *line_pt) {
135
473k
  ComputeConstrainedDistances(direction, min_dist, max_dist);
136
  // Do something sensible with no points or computed distances.
137
473k
  if (pts_.empty() || distances_.empty()) {
138
0
    line_pt->set_x(0);
139
0
    line_pt->set_y(0);
140
0
    return 0.0;
141
0
  }
142
473k
  auto median_index = distances_.size() / 2;
143
473k
  std::nth_element(distances_.begin(), distances_.begin() + median_index, distances_.end());
144
473k
  *line_pt = distances_[median_index].data();
145
473k
  if (debug) {
146
0
    tesserr << "Constrained fit to dir " << direction.x() << ", "
147
0
            << direction.y() << " = "
148
0
            << line_pt->x() << ", " << line_pt->y()
149
0
            << " :" << distances_.size() << " distances:\n";
150
0
    for (unsigned i = 0; i < distances_.size(); ++i) {
151
0
      tesserr << i << ": "
152
0
              << distances_[i].data().x() << ", "
153
0
              << distances_[i].data().y() << " -> "
154
0
              << distances_[i].key() << '\n';
155
0
    }
156
0
    tesserr << "Result = " << median_index << '\n';
157
0
  }
158
  // Center distances on the fitted point.
159
473k
  double dist_origin = direction * *line_pt;
160
3.22M
  for (auto &distance : distances_) {
161
3.22M
    distance.key() -= dist_origin;
162
3.22M
  }
163
473k
  return sqrt(EvaluateLineFit());
164
473k
}
165
166
// Returns true if there were enough points at the last call to Fit or
167
// ConstrainedFit for the fitted points to be used on a badly fitted line.
168
253k
bool DetLineFit::SufficientPointsForIndependentFit() const {
169
253k
  return distances_.size() >= kMinPointsForErrorCount;
170
253k
}
171
172
// Backwards compatible fit returning a gradient and constant.
173
// Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this
174
// function in preference to the LMS class.
175
135k
double DetLineFit::Fit(float *m, float *c) {
176
135k
  ICOORD start, end;
177
135k
  double error = Fit(&start, &end);
178
135k
  if (end.x() != start.x()) {
179
71.6k
    *m = static_cast<float>(end.y() - start.y()) / (end.x() - start.x());
180
71.6k
    *c = start.y() - *m * start.x();
181
71.6k
  } else {
182
63.7k
    *m = 0.0f;
183
63.7k
    *c = 0.0f;
184
63.7k
  }
185
135k
  return error;
186
135k
}
187
188
// Backwards compatible constrained fit with a supplied gradient.
189
// Deprecated. Use ConstrainedFit(const FCOORD& direction) where possible
190
// to avoid potential difficulties with infinite gradients.
191
218k
double DetLineFit::ConstrainedFit(double m, float *c) {
192
  // Do something sensible with no points.
193
218k
  if (pts_.empty()) {
194
0
    *c = 0.0f;
195
0
    return 0.0;
196
0
  }
197
218k
  double cos = 1.0 / sqrt(1.0 + m * m);
198
218k
  FCOORD direction(cos, m * cos);
199
218k
  ICOORD line_pt;
200
218k
  double error = ConstrainedFit(direction, -FLT_MAX, FLT_MAX, false, &line_pt);
201
218k
  *c = line_pt.y() - line_pt.x() * m;
202
218k
  return error;
203
218k
}
204
205
// Computes and returns the squared evaluation metric for a line fit.
206
1.78M
double DetLineFit::EvaluateLineFit() {
207
  // Compute the upper quartile error from the line.
208
1.78M
  double dist = ComputeUpperQuartileError();
209
1.78M
  if (distances_.size() >= kMinPointsForErrorCount && dist > kMaxRealDistance * kMaxRealDistance) {
210
    // Use the number of mis-fitted points as the error metric, as this
211
    // gives a better measure of fit for badly fitted lines where more
212
    // than a quarter are badly fitted.
213
392k
    double threshold = kMaxRealDistance * sqrt(square_length_);
214
392k
    dist = NumberOfMisfittedPoints(threshold);
215
392k
  }
216
1.78M
  return dist;
217
1.78M
}
218
219
// Computes the absolute error distances of the points from the line,
220
// and returns the squared upper-quartile error distance.
221
1.78M
double DetLineFit::ComputeUpperQuartileError() {
222
1.78M
  int num_errors = distances_.size();
223
1.78M
  if (num_errors == 0) {
224
0
    return 0.0;
225
0
  }
226
  // Get the absolute values of the errors.
227
35.9M
  for (int i = 0; i < num_errors; ++i) {
228
34.1M
    if (distances_[i].key() < 0) {
229
12.7M
      distances_[i].key() = -distances_[i].key();
230
12.7M
    }
231
34.1M
  }
232
  // Now get the upper quartile distance.
233
1.78M
  auto index = 3 * num_errors / 4;
234
1.78M
  std::nth_element(distances_.begin(), distances_.begin() + index, distances_.end());
235
1.78M
  double dist = distances_[index].key();
236
  // The true distance is the square root of the dist squared / square_length.
237
  // Don't bother with the square root. Just return the square distance.
238
1.78M
  return square_length_ > 0.0 ? dist * dist / square_length_ : 0.0;
239
1.78M
}
240
241
// Returns the number of sample points that have an error more than threshold.
242
392k
int DetLineFit::NumberOfMisfittedPoints(double threshold) const {
243
392k
  int num_misfits = 0;
244
392k
  int num_dists = distances_.size();
245
  // Get the absolute values of the errors.
246
20.5M
  for (int i = 0; i < num_dists; ++i) {
247
20.2M
    if (distances_[i].key() > threshold) {
248
13.5M
      ++num_misfits;
249
13.5M
    }
250
20.2M
  }
251
392k
  return num_misfits;
252
392k
}
253
254
// Computes all the cross product distances of the points from the line,
255
// storing the actual (signed) cross products in distances.
256
// Ignores distances of points that are further away than the previous point,
257
// and overlaps the previous point by at least half.
258
1.31M
void DetLineFit::ComputeDistances(const ICOORD &start, const ICOORD &end) {
259
1.31M
  distances_.clear();
260
1.31M
  ICOORD line_vector = end;
261
1.31M
  line_vector -= start;
262
1.31M
  square_length_ = line_vector.sqlength();
263
1.31M
  int line_length = IntCastRounded(sqrt(square_length_));
264
  // Compute the distance of each point from the line.
265
1.31M
  int prev_abs_dist = 0;
266
1.31M
  int prev_dot = 0;
267
35.5M
  for (unsigned i = 0; i < pts_.size(); ++i) {
268
34.2M
    ICOORD pt_vector = pts_[i].pt;
269
34.2M
    pt_vector -= start;
270
34.2M
    int dot = line_vector % pt_vector;
271
    // Compute |line_vector||pt_vector|sin(angle between)
272
34.2M
    int dist = line_vector * pt_vector;
273
34.2M
    int abs_dist = dist < 0 ? -dist : dist;
274
34.2M
    if (abs_dist > prev_abs_dist && i > 0) {
275
      // Ignore this point if it overlaps the previous one.
276
15.6M
      int separation = abs(dot - prev_dot);
277
15.6M
      if (separation < line_length * pts_[i].halfwidth ||
278
15.6M
          separation < line_length * pts_[i - 1].halfwidth) {
279
3.32M
        continue;
280
3.32M
      }
281
15.6M
    }
282
30.9M
    distances_.emplace_back(dist, pts_[i].pt);
283
30.9M
    prev_abs_dist = abs_dist;
284
30.9M
    prev_dot = dot;
285
30.9M
  }
286
1.31M
}
287
288
// Computes all the cross product distances of the points perpendicular to
289
// the given direction, ignoring distances outside of the give distance range,
290
// storing the actual (signed) cross products in distances_.
291
void DetLineFit::ComputeConstrainedDistances(const FCOORD &direction, double min_dist,
292
473k
                                             double max_dist) {
293
473k
  distances_.clear();
294
473k
  square_length_ = direction.sqlength();
295
  // Compute the distance of each point from the line.
296
5.41M
  for (auto &pt : pts_) {
297
5.41M
    FCOORD pt_vector = pt.pt;
298
    // Compute |line_vector||pt_vector|sin(angle between)
299
5.41M
    double dist = direction * pt_vector;
300
5.41M
    if (min_dist <= dist && dist <= max_dist) {
301
3.22M
      distances_.emplace_back(dist, pt.pt);
302
3.22M
    }
303
5.41M
  }
304
473k
}
305
306
} // namespace tesseract.