Coverage Report

Created: 2025-11-16 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/classify/normmatch.cpp
Line
Count
Source
1
/******************************************************************************
2
 ** Filename:    normmatch.cpp
3
 ** Purpose:     Simple matcher based on character normalization features.
4
 ** Author:      Dan Johnson
5
 **
6
 ** (c) Copyright Hewlett-Packard Company, 1988.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 ******************************************************************************/
17
/*----------------------------------------------------------------------------
18
          Include Files and Type Defines
19
----------------------------------------------------------------------------*/
20
#include "normmatch.h"
21
22
#include "classify.h"
23
#include "clusttool.h"
24
#include "helpers.h"
25
#include "normfeat.h"
26
#include "params.h"
27
#include "unicharset.h"
28
29
#include <cmath>
30
#include <cstdio>
31
#include <sstream> // for std::istringstream
32
33
namespace tesseract {
34
35
struct NORM_PROTOS {
36
4
  NORM_PROTOS(size_t n) : NumProtos(n), Protos(n) {
37
4
  }
38
  int NumParams = 0;
39
  int NumProtos;
40
  PARAM_DESC *ParamDesc = nullptr;
41
  std::vector<LIST> Protos;
42
};
43
44
/*----------------------------------------------------------------------------
45
              Private Code
46
----------------------------------------------------------------------------*/
47
48
/**
49
 * @name NormEvidenceOf
50
 *
51
 * Return the new type of evidence number corresponding to this
52
 * normalization adjustment.  The equation that represents the transform is:
53
 *       1 / (1 + (NormAdj / midpoint) ^ curl)
54
 */
55
205M
static float NormEvidenceOf(float NormAdj) {
56
205M
  NormAdj /= static_cast<float>(classify_norm_adj_midpoint);
57
58
205M
  if (classify_norm_adj_curl == 3) {
59
0
    NormAdj = NormAdj * NormAdj * NormAdj;
60
205M
  } else if (classify_norm_adj_curl == 2) {
61
205M
    NormAdj = NormAdj * NormAdj;
62
205M
  } else {
63
0
    NormAdj = std::pow(NormAdj, static_cast<float>(classify_norm_adj_curl));
64
0
  }
65
205M
  return (1 / (1 + NormAdj));
66
205M
}
67
68
/*----------------------------------------------------------------------------
69
        Variables
70
----------------------------------------------------------------------------*/
71
72
/**
 * Control knobs for the normalization adjustment, both read by
 * NormEvidenceOf: classify_norm_adj_midpoint is the value the adjustment is
 * divided by, and classify_norm_adj_curl is the exponent applied to the
 * result of that division.
 */
73
double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
74
double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
75
/**
 * Weight of width variance against height and vertical position.
 * ComputeNormMatch multiplies the width (CharNormRy) term of the distance by
 * this factor before adding it to the total.
 */
76
const float kWidthErrorWeighting = 0.125f;
77
78
/*----------------------------------------------------------------------------
79
              Public Code
80
----------------------------------------------------------------------------*/
81
/**
82
 * This routine compares Features against each character
83
 * normalization proto for ClassId and returns the match
84
 * rating of the best match.
85
 * @param ClassId id of class to match against
86
 * @param feature character normalization feature
87
 * @param DebugMatch controls dump of debug info
88
 *
89
 * Globals:
90
 * #NormProtos character normalization prototypes
91
 *
92
 * @return Best match rating for Feature against protos of ClassId.
93
 */
94
209M
float Classify::ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch) {
95
209M
  if (ClassId >= NormProtos->NumProtos) {
96
0
    ClassId = NO_CLASS;
97
0
  }
98
99
  /* handle requests for classification as noise */
100
209M
  if (ClassId == NO_CLASS) {
101
    /* kludge - clean up constants and make into control knobs later */
102
1.85M
    float Match = (feature.Params[CharNormLength] * feature.Params[CharNormLength] * 500.0f +
103
1.85M
                   feature.Params[CharNormRx] * feature.Params[CharNormRx] * 8000.0f +
104
1.85M
                   feature.Params[CharNormRy] * feature.Params[CharNormRy] * 8000.0f);
105
1.85M
    return (1 - NormEvidenceOf(Match));
106
1.85M
  }
107
108
207M
  if (DebugMatch) {
109
0
    tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
110
0
  }
111
112
207M
  LIST Protos = NormProtos->Protos[ClassId];
113
207M
  if (Protos == nullptr) {
114
     // Avoid FP overflow in NormEvidenceOf.
115
3.70M
     return 1.0f;
116
3.70M
  }
117
118
203M
  float BestMatch = FLT_MAX;
119
203M
  iterate(Protos) {
120
203M
    auto Proto = reinterpret_cast<PROTOTYPE *>(Protos->first_node());
121
203M
    float Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
122
203M
    float Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
123
203M
    if (DebugMatch) {
124
0
      tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormY], Delta,
125
0
              Proto->Weight.Elliptical[CharNormY], Match);
126
0
    }
127
203M
    Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
128
203M
    Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
129
203M
    if (DebugMatch) {
130
0
      tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormRx], Delta,
131
0
              Proto->Weight.Elliptical[CharNormRx], Match);
132
0
    }
133
    // Ry is width! See intfx.cpp.
134
203M
    Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
135
203M
    if (DebugMatch) {
136
0
      tprintf("Width: Proto=%g, Delta=%g, Var=%g\n", Proto->Mean[CharNormRy], Delta,
137
0
              Proto->Weight.Elliptical[CharNormRy]);
138
0
    }
139
203M
    Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
140
203M
    Delta *= kWidthErrorWeighting;
141
203M
    Match += Delta;
142
203M
    if (DebugMatch) {
143
0
      tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n", Match,
144
0
              Match / classify_norm_adj_midpoint, NormEvidenceOf(Match),
145
0
              256 * (1 - NormEvidenceOf(Match)));
146
0
    }
147
148
203M
    if (Match < BestMatch) {
149
203M
      BestMatch = Match;
150
203M
    }
151
203M
  }
152
203M
  return 1 - NormEvidenceOf(BestMatch);
153
207M
} /* ComputeNormMatch */
154
155
0
void Classify::FreeNormProtos() {
156
0
  if (NormProtos != nullptr) {
157
0
    for (int i = 0; i < NormProtos->NumProtos; i++) {
158
0
      FreeProtoList(&NormProtos->Protos[i]);
159
0
    }
160
0
    delete[] NormProtos->ParamDesc;
161
0
    delete NormProtos;
162
0
    NormProtos = nullptr;
163
0
  }
164
0
}
165
166
/**
167
 * This routine allocates a new data structure to hold
168
 * a set of character normalization protos.  It then fills in
169
 * the data structure by reading from the specified File.
170
 * @param fp open text file to read normalization protos from
171
 * Globals: none
172
 * @return Character normalization protos.
173
 */
174
4
NORM_PROTOS *Classify::ReadNormProtos(TFile *fp) {
175
4
  char unichar[2 * UNICHAR_LEN + 1];
176
4
  UNICHAR_ID unichar_id;
177
4
  LIST Protos;
178
4
  int NumProtos;
179
180
  /* allocate and initialization data structure */
181
4
  auto NormProtos = new NORM_PROTOS(unicharset.size());
182
183
  /* read file header and save in data structure */
184
4
  NormProtos->NumParams = ReadSampleSize(fp);
185
4
  NormProtos->ParamDesc = ReadParamDesc(fp, NormProtos->NumParams);
186
187
  /* read protos for each class into a separate list */
188
4
  const int kMaxLineSize = 100;
189
4
  char line[kMaxLineSize];
190
884
  while (fp->FGets(line, kMaxLineSize) != nullptr) {
191
880
    std::istringstream stream(line);
192
880
    stream.imbue(std::locale::classic());
193
880
    stream >> unichar >> NumProtos;
194
880
    if (stream.fail()) {
195
440
      continue;
196
440
    }
197
440
    if (unicharset.contains_unichar(unichar)) {
198
440
      unichar_id = unicharset.unichar_to_id(unichar);
199
440
      Protos = NormProtos->Protos[unichar_id];
200
880
      for (int i = 0; i < NumProtos; i++) {
201
440
        Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
202
440
      }
203
440
      NormProtos->Protos[unichar_id] = Protos;
204
440
    } else {
205
0
      tprintf("Error: unichar %s in normproto file is not in unichar set.\n", unichar);
206
0
      for (int i = 0; i < NumProtos; i++) {
207
0
        FreePrototype(ReadPrototype(fp, NormProtos->NumParams));
208
0
      }
209
0
    }
210
440
  }
211
4
  return NormProtos;
212
4
} /* ReadNormProtos */
213
214
} // namespace tesseract