Coverage Report

Created: 2025-09-27 07:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/wordrec/lm_pain_points.cpp
Line
Count
Source
1
///////////////////////////////////////////////////////////////////////
2
// File:        pain_points.cpp
3
// Description: Functions that utilize the knowledge about the properties
4
//              of the paths explored by the segmentation search in order
5
//              to "pain points" - the locations in the ratings matrix
6
//              which should be classified next.
7
// Author:      Rika Antonova
8
// Created:     Mon Jun 20 11:26:43 PST 2012
9
//
10
// (C) Copyright 2012, Google Inc.
11
// Licensed under the Apache License, Version 2.0 (the "License");
12
// you may not use this file except in compliance with the License.
13
// You may obtain a copy of the License at
14
// http://www.apache.org/licenses/LICENSE-2.0
15
// Unless required by applicable law or agreed to in writing, software
16
// distributed under the License is distributed on an "AS IS" BASIS,
17
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
// See the License for the specific language governing permissions and
19
// limitations under the License.
20
//
21
///////////////////////////////////////////////////////////////////////
22
23
#include "lm_pain_points.h"
24
25
#include "associate.h"
26
#include "dict.h"
27
#include "genericheap.h"
28
#include "lm_state.h"
29
#include "matrix.h"
30
#include "pageres.h"
31
32
#include <algorithm>
33
34
namespace tesseract {
35
36
const float LMPainPoints::kDefaultPainPointPriorityAdjustment = 2.0f;
37
const float LMPainPoints::kLooseMaxCharWhRatio = 2.5f;
38
39
2.21M
LMPainPointsType LMPainPoints::Deque(MATRIX_COORD *pp, float *priority) {
40
7.38M
  for (int h = 0; h < LM_PPTYPE_NUM; ++h) {
41
7.27M
    if (pain_points_heaps_[h].empty()) {
42
5.17M
      continue;
43
5.17M
    }
44
2.09M
    *priority = pain_points_heaps_[h].PeekTop().key();
45
2.09M
    *pp = pain_points_heaps_[h].PeekTop().data();
46
2.09M
    pain_points_heaps_[h].Pop(nullptr);
47
2.09M
    return static_cast<LMPainPointsType>(h);
48
7.27M
  }
49
113k
  return LM_PPTYPE_NUM;
50
2.21M
}
51
52
145k
void LMPainPoints::GenerateInitial(WERD_RES *word_res) {
53
145k
  MATRIX *ratings = word_res->ratings;
54
145k
  AssociateStats associate_stats;
55
748k
  for (int col = 0; col < ratings->dimension(); ++col) {
56
603k
    int row_end = std::min(ratings->dimension(), col + ratings->bandwidth() + 1);
57
2.07M
    for (int row = col + 1; row < row_end; ++row) {
58
1.46M
      MATRIX_COORD coord(col, row);
59
1.46M
      if (coord.Valid(*ratings) && ratings->get(col, row) != NOT_CLASSIFIED) {
60
0
        continue;
61
0
      }
62
      // Add an initial pain point if needed.
63
1.46M
      if (ratings->Classified(col, row - 1, dict_->WildcardID()) ||
64
1.00M
          (col + 1 < ratings->dimension() &&
65
1.00M
           ratings->Classified(col + 1, row, dict_->WildcardID()))) {
66
457k
        GeneratePainPoint(col, row, LM_PPTYPE_SHAPE, 0.0, true, max_char_wh_ratio_, word_res);
67
457k
      }
68
1.46M
    }
69
603k
  }
70
145k
}
71
72
void LMPainPoints::GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse,
73
355k
                                    WERD_RES *word_res) {
74
355k
  ViterbiStateEntry *curr_vse = vse;
75
355k
  BLOB_CHOICE *curr_b = vse->curr_b;
76
  // The following pain point generation and priority calculation approaches
77
  // prioritize exploring paths with low average rating of the known part of
78
  // the path, while not relying on the ratings of the pieces to be combined.
79
  //
80
  // A pain point to combine the neighbors is generated for each pair of
81
  // neighboring blobs on the path (the path is represented by vse argument
82
  // given to GenerateFromPath()). The priority of each pain point is set to
83
  // the average rating (per outline length) of the path, not including the
84
  // ratings of the blobs to be combined.
85
  // The ratings of the blobs to be combined are not used to calculate the
86
  // priority, since it is not possible to determine from their magnitude
87
  // whether it will be beneficial to combine the blobs. The reason is that
88
  // chopped junk blobs (/ | - ') can have very good (low) ratings, however
89
  // combining them will be beneficial. Blobs with high ratings might be
90
  // over-joined pieces of characters, but also could be blobs from an unseen
91
  // font or chopped pieces of complex characters.
92
2.82M
  while (curr_vse->parent_vse != nullptr) {
93
2.47M
    ViterbiStateEntry *parent_vse = curr_vse->parent_vse;
94
2.47M
    const MATRIX_COORD &curr_cell = curr_b->matrix_cell();
95
2.47M
    const MATRIX_COORD &parent_cell = parent_vse->curr_b->matrix_cell();
96
2.47M
    MATRIX_COORD pain_coord(parent_cell.col, curr_cell.row);
97
2.47M
    if (!pain_coord.Valid(*word_res->ratings) ||
98
2.38M
        !word_res->ratings->Classified(parent_cell.col, curr_cell.row, dict_->WildcardID())) {
99
      // rat_subtr contains ratings sum of the two adjacent blobs to be merged.
100
      // rat_subtr will be subtracted from the ratings sum of the path, since
101
      // the blobs will be joined into a new blob, whose rating is yet unknown.
102
2.02M
      float rat_subtr = curr_b->rating() + parent_vse->curr_b->rating();
103
      // ol_subtr contains the outline length of the blobs that will be joined.
104
2.02M
      float ol_subtr =
105
2.02M
          AssociateUtils::ComputeOutlineLength(rating_cert_scale, *curr_b) +
106
2.02M
          AssociateUtils::ComputeOutlineLength(rating_cert_scale, *(parent_vse->curr_b));
107
      // ol_dif is the outline of the path without the two blobs to be joined.
108
2.02M
      float ol_dif = vse->outline_length - ol_subtr;
109
      // priority is set to the average rating of the path per unit of outline,
110
      // not counting the ratings of the pieces to be joined.
111
2.02M
      float priority = ol_dif > 0 ? (vse->ratings_sum - rat_subtr) / ol_dif : 0.0;
112
2.02M
      GeneratePainPoint(pain_coord.col, pain_coord.row, LM_PPTYPE_PATH, priority, true,
113
2.02M
                        max_char_wh_ratio_, word_res);
114
2.02M
    } else if (debug_level_ > 3) {
115
0
      tprintf("NO pain point (Classified) for col=%d row=%d type=%s\n", pain_coord.col,
116
0
              pain_coord.row, LMPainPointsTypeName[LM_PPTYPE_PATH]);
117
0
      BLOB_CHOICE_IT b_it(word_res->ratings->get(pain_coord.col, pain_coord.row));
118
0
      for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
119
0
        BLOB_CHOICE *choice = b_it.data();
120
0
        choice->print_full();
121
0
      }
122
0
    }
123
124
2.47M
    curr_vse = parent_vse;
125
2.47M
    curr_b = curr_vse->curr_b;
126
2.47M
  }
127
355k
}
128
129
void LMPainPoints::GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse,
130
249k
                                      WERD_RES *word_res) {
131
  // Begins and ends in DANGERR vector now record the blob indices as used
132
  // by the ratings matrix.
133
11.5M
  for (auto &&danger : fixpt) {
134
    // Only use dangerous ambiguities.
135
11.5M
    if (danger.dangerous) {
136
1.20M
      GeneratePainPoint(danger.begin, danger.end - 1, LM_PPTYPE_AMBIG, vse->cost, true,
137
1.20M
                        kLooseMaxCharWhRatio, word_res);
138
1.20M
    }
139
11.5M
  }
140
249k
}
141
142
bool LMPainPoints::GeneratePainPoint(int col, int row, LMPainPointsType pp_type,
143
                                     float special_priority, bool ok_to_extend,
144
5.92M
                                     float max_char_wh_ratio, WERD_RES *word_res) {
145
5.92M
  MATRIX_COORD coord(col, row);
146
5.92M
  if (coord.Valid(*word_res->ratings) &&
147
5.59M
      word_res->ratings->Classified(col, row, dict_->WildcardID())) {
148
1.63M
    return false;
149
1.63M
  }
150
4.29M
  if (debug_level_ > 3) {
151
0
    tprintf("Generating pain point for col=%d row=%d type=%s\n", col, row,
152
0
            LMPainPointsTypeName[pp_type]);
153
0
  }
154
  // Compute associate stats.
155
4.29M
  AssociateStats associate_stats;
156
4.29M
  AssociateUtils::ComputeStats(col, row, nullptr, 0, fixed_pitch_, max_char_wh_ratio, word_res,
157
4.29M
                               debug_level_, &associate_stats);
158
  // For fixed-pitch fonts/languages: if the current combined blob overlaps
159
  // the next blob on the right and it is ok to extend the blob, try extending
160
  // the blob until there is no overlap with the next blob on the right or
161
  // until the width-to-height ratio becomes too large.
162
4.29M
  if (ok_to_extend) {
163
4.29M
    while (associate_stats.bad_fixed_pitch_right_gap && row + 1 < word_res->ratings->dimension() &&
164
0
           !associate_stats.bad_fixed_pitch_wh_ratio) {
165
0
      AssociateUtils::ComputeStats(col, ++row, nullptr, 0, fixed_pitch_, max_char_wh_ratio,
166
0
                                   word_res, debug_level_, &associate_stats);
167
0
    }
168
4.29M
  }
169
4.29M
  if (associate_stats.bad_shape) {
170
273k
    if (debug_level_ > 3) {
171
0
      tprintf("Discarded pain point with a bad shape\n");
172
0
    }
173
273k
    return false;
174
273k
  }
175
176
  // Insert the new pain point into pain_points_heap_.
177
4.02M
  if (pain_points_heaps_[pp_type].size() < max_heap_size_) {
178
    // Compute pain point priority.
179
4.02M
    float priority;
180
4.02M
    if (pp_type == LM_PPTYPE_PATH) {
181
1.93M
      priority = special_priority;
182
2.08M
    } else {
183
2.08M
      priority = associate_stats.gap_sum;
184
2.08M
    }
185
4.02M
    MatrixCoordPair pain_point(priority, MATRIX_COORD(col, row));
186
4.02M
    pain_points_heaps_[pp_type].Push(&pain_point);
187
4.02M
    if (debug_level_) {
188
0
      tprintf("Added pain point with priority %g\n", priority);
189
0
    }
190
4.02M
    return true;
191
4.02M
  } else {
192
0
    if (debug_level_) {
193
0
      tprintf("Pain points heap is full\n");
194
0
    }
195
0
    return false;
196
0
  }
197
4.02M
}
198
199
/**
200
 * Adjusts the pain point coordinates to cope with expansion of the ratings
201
 * matrix due to a split of the blob with the given index.
202
 */
203
192k
void LMPainPoints::RemapForSplit(int index) {
204
770k
  for (auto &pain_points_heap : pain_points_heaps_) {
205
770k
    std::vector<MatrixCoordPair> &heap = pain_points_heap.heap();
206
6.88M
    for (auto &&entry : heap) {
207
6.88M
      entry.data().MapForSplit(index);
208
6.88M
    }
209
770k
  }
210
192k
}
211
212
} //  namespace tesseract