Coverage Report

Created: 2025-11-16 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/ccutil/ambigs.h
Line
Count
Source
1
///////////////////////////////////////////////////////////////////////
2
// File:        ambigs.h
3
// Description: Constants, flags, functions for dealing with
4
//              ambiguities (training and recognition).
5
// Author:      Daria Antonova
6
//
7
// (C) Copyright 2008, Google Inc.
8
// Licensed under the Apache License, Version 2.0 (the "License");
9
// you may not use this file except in compliance with the License.
10
// You may obtain a copy of the License at
11
// http://www.apache.org/licenses/LICENSE-2.0
12
// Unless required by applicable law or agreed to in writing, software
13
// distributed under the License is distributed on an "AS IS" BASIS,
14
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
// See the License for the specific language governing permissions and
16
// limitations under the License.
17
//
18
///////////////////////////////////////////////////////////////////////
19
20
#ifndef TESSERACT_CCUTIL_AMBIGS_H_
21
#define TESSERACT_CCUTIL_AMBIGS_H_
22
23
#ifdef HAVE_CONFIG_H
24
#  include "config_auto.h" // DISABLED_LEGACY_ENGINE
25
#endif
26
27
#if !defined(DISABLED_LEGACY_ENGINE)
28
29
#  include <tesseract/unichar.h>
30
#  include "elst.h"
31
#  include "tprintf.h"
32
#  include "unicharset.h"
33
34
152k
// Capacity constant for ambiguity ngrams: AmbigSpec declares wrong_ngram and
// correct_fragments with MAX_AMBIG_SIZE + 1 elements (presumably the extra
// slot holds the INVALID_UNICHAR_ID terminator - confirm).
#  define MAX_AMBIG_SIZE 10
35
36
namespace tesseract {
37
38
// A resizable sequence of unichar ids.
using UnicharIdVector = std::vector<UNICHAR_ID>;
39
40
// Describes how an (OCRed ngram, correct ngram) ambiguity pair is treated.
enum AmbigType {
41
  NOT_AMBIG,      // the ngram pair is not ambiguous
42
  REPLACE_AMBIG,  // OCRed ngram should always be substituted with the correct one
43
  DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1)
44
  SIMILAR_AMBIG,  // use pairwise classifier for OCRed/correct pair (1-1)
45
  CASE_AMBIG,     // this is a case ambiguity (1-1)
46
47
  AMBIG_TYPE_COUNT // number of ambiguity types above; must remain the last enumerator
48
};
49
50
// A collection of utility functions for arrays of UNICHAR_IDs that are
51
// terminated by INVALID_UNICHAR_ID.
52
class UnicharIdArrayUtils {
53
public:
54
  // Lexicographically compares two arrays of unichar ids. Returns -1 if, at the
55
  // first differing position, array1 has ended or array1[i] < array2[i].
56
  // Returns 0 if the arrays are equal, 1 otherwise.
57
  // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
58
593M
  static inline int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2) {
59
1.19G
    for (;;) {
60
1.19G
      const UNICHAR_ID val1 = *ptr1++;
61
1.19G
      const UNICHAR_ID val2 = *ptr2++;
62
1.19G
      if (val1 != val2) {
63
590M
        if (val1 == INVALID_UNICHAR_ID) {
64
5.67M
          return -1; // array1 ended first: it is a proper prefix of array2
65
5.67M
        }
66
585M
        if (val2 == INVALID_UNICHAR_ID) {
67
116
          return 1; // array2 ended first: it is a proper prefix of array1
68
116
        }
69
585M
        if (val1 < val2) {
70
14.4M
          return -1;
71
14.4M
        }
72
570M
        return 1;
73
585M
      }
74
600M
      if (val1 == INVALID_UNICHAR_ID) {
75
2.09M
        return 0; // both arrays reached their terminator together
76
2.09M
      }
77
600M
    }
78
593M
  }
79
80
  // Copies UNICHAR_IDs from src to dst (terminator included). Returns the
81
  // number of ids copied, not counting the terminating INVALID_UNICHAR_ID.
82
  // Assumes src is INVALID_UNICHAR_ID-terminated and dst has room for all of it.
83
76.1k
  static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
84
76.1k
    int i = 0;
85
303k
    do {
86
303k
      dst[i] = src[i];
87
303k
    } while (dst[i++] != INVALID_UNICHAR_ID);
88
76.1k
    return i - 1; // i was advanced past the terminator; exclude it from the count
89
76.1k
  }
90
91
  // Prints the unichars for the ids in array (preceded by "[Empty]" if there
92
  // are none), then the numeric ids in parentheses and a newline.
  // The function assumes that array is terminated by INVALID_UNICHAR_ID.
93
0
  static inline void print(const UNICHAR_ID array[], const UNICHARSET &unicharset) {
94
0
    const UNICHAR_ID *ptr = array;
95
0
    if (*ptr == INVALID_UNICHAR_ID) {
96
0
      tprintf("[Empty]");
97
0
    }
98
0
    while (*ptr != INVALID_UNICHAR_ID) {
99
0
      tprintf("%s ", unicharset.id_to_unichar(*ptr++));
100
0
    }
101
0
    tprintf("( ");
102
0
    ptr = array; // rewind for the second pass, which prints the raw ids
103
0
    while (*ptr != INVALID_UNICHAR_ID) {
104
0
      tprintf("%d ", *ptr++);
105
0
    }
106
0
    tprintf(")\n");
107
0
  }
108
};
109
110
// An AmbigSpec_LIST stores a list of dangerous ambigs that
111
// start with the same unichar (e.g. r->t rn->m rr1->m).
112
class AmbigSpec : public ELIST<AmbigSpec>::LINK {
113
public:
114
  AmbigSpec(); // declared only; the definition is not in this header
115
  ~AmbigSpec() = default;
116
117
  // Comparator function for sorting AmbigSpec_LISTs. The lists will be sorted
118
  // by their wrong_ngram arrays, then by correct_fragments on ties. Example of
119
  // wrong_ngram vectors in a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9 8 1].
120
10.5M
  static int compare_ambig_specs(const AmbigSpec *s1, const AmbigSpec *s2) {
121
10.5M
    int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
122
10.5M
    if (result != 0) {
123
10.5M
      return result;
124
10.5M
    }
125
436
    return UnicharIdArrayUtils::compare(s1->correct_fragments, s2->correct_fragments);
126
10.5M
  }
127
128
  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; // must be INVALID_UNICHAR_ID-terminated for compare()
129
  UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1]; // must be INVALID_UNICHAR_ID-terminated for compare()
130
  UNICHAR_ID correct_ngram_id; // NOTE(review): presumably the id of the whole correct ngram - confirm
131
  AmbigType type;
132
  int wrong_ngram_size; // NOTE(review): presumably the number of ids in wrong_ngram - confirm
133
};
134
// NOTE(review): presumably this elst.h macro declares AmbigSpec_LIST and its
// iterator type, which the rest of this header relies on - confirm.
ELISTIZEH(AmbigSpec)
135
136
// Ambiguity table (UnicharAmbigsVector): entry i stores a set of ambiguities whose
137
// wrong ngram starts with unichar id i.
138
// Elements are raw pointers; UnicharAmbigs deletes the elements of its
// dang_ambigs_ and replace_ambigs_ tables in its destructor.
using UnicharAmbigsVector = std::vector<AmbigSpec_LIST *>;
139
140
// Holds the ambiguity tables (dangerous, replaceable, definite 1-1 and the
// adaption-related vectors) and the functions that load and query them.
class UnicharAmbigs {
141
public:
142
8
  UnicharAmbigs() = default;
143
0
  // Deletes the elements of replace_ambigs_, dang_ambigs_ and
  // one_to_one_definite_ambigs_.
  // NOTE(review): the elements of ambigs_for_adaption_ and
  // reverse_ambigs_for_adaption_ are not deleted here - confirm whether they
  // are owned elsewhere or leaked.
  // NOTE(review): copy construction/assignment are still implicitly generated,
  // so copying an instance would delete the same pointers twice - consider
  // deleting the copy operations.
  ~UnicharAmbigs() {
144
0
    for (auto data : replace_ambigs_) {
145
0
      delete data;
146
0
    }
147
0
    for (auto data : dang_ambigs_) {
148
0
      delete data;
149
0
    }
150
0
    for (auto data : one_to_one_definite_ambigs_) {
151
0
      delete data;
152
0
    }
153
0
  }
154
155
817k
  // Read-only access to the dangerous-ambiguity table.
  const UnicharAmbigsVector &dang_ambigs() const {
156
817k
    return dang_ambigs_;
157
817k
  }
158
817k
  // Read-only access to the replaceable-ambiguity table.
  const UnicharAmbigsVector &replace_ambigs() const {
159
817k
    return replace_ambigs_;
160
817k
  }
161
162
  // Initializes the ambigs by adding a null pointer to each table.
163
  void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption);
164
165
  // Loads the universal ambigs that are useful for any language.
166
  void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset);
167
168
  // Fills in two ambiguity tables (replaceable and dangerous) with information
169
  // read from the ambigs file. An ambiguity table is an array of lists.
170
  // The array is indexed by a class id. Each entry in the table provides
171
  // a list of potential ambiguities which can start with the corresponding
172
  // character. For example the ambiguity "rn -> m", would be located in the
173
  // table at index of unicharset.unichar_to_id('r').
174
  // 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
175
  // one_to_one_definite_ambigs_. This vector is also indexed by the class id
176
  // of the wrong part of the ambiguity and each entry points to a vector of
177
  // unichar ids that are ambiguous to it.
178
  // encoder_set is used to encode the ambiguity strings, undisturbed by new
179
  // unichar_ids that may be created by adding the ambigs.
180
  void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level,
181
                         bool use_ambigs_for_adaption, UNICHARSET *unicharset);
182
183
  // Returns definite 1-1 ambigs for the given unichar id, or nullptr if the
  // table is empty. unichar_id is used as an unchecked vector index.
184
0
  inline const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {
185
0
    if (one_to_one_definite_ambigs_.empty()) {
186
0
      return nullptr;
187
0
    }
188
0
    return one_to_one_definite_ambigs_[unichar_id];
189
0
  }
190
191
  // Returns a pointer to the vector with all unichar ids that appear in the
192
  // 'correct' part of the ambiguity pair when the given unichar id appears
193
  // in the 'wrong' part of the ambiguity. E.g. if the DangAmbigs file consists
194
  // of m->rn,rn->m,m->iii, AmbigsForAdaption() called with the unichar id of
195
  // m will return a pointer to a vector with unichar ids of r,n,i.
  // Returns nullptr if the table is empty. unichar_id is used as an unchecked
  // vector index.
196
0
  inline const UnicharIdVector *AmbigsForAdaption(UNICHAR_ID unichar_id) const {
197
0
    if (ambigs_for_adaption_.empty()) {
198
0
      return nullptr;
199
0
    }
200
0
    return ambigs_for_adaption_[unichar_id];
201
0
  }
202
203
  // Similar to the above, but return the vector of unichar ids for which
204
  // the given unichar_id is an ambiguity (appears in the 'wrong' part of
205
  // some ambiguity pair). Returns nullptr if the table is empty; unichar_id
  // is used as an unchecked vector index.
206
97
  inline const UnicharIdVector *ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const {
207
97
    if (reverse_ambigs_for_adaption_.empty()) {
208
97
      return nullptr;
209
97
    }
210
0
    return reverse_ambigs_for_adaption_[unichar_id];
211
97
  }
212
213
private:
214
  // The two helpers below are only declared in this header.
  // NOTE(review): presumably they are used by LoadUnicharAmbigs to parse one
  // line of the ambigs file and to store the result in a table - confirm
  // against the definitions.
  bool ParseAmbiguityLine(int line_num, int version, int debug_level, const UNICHARSET &unicharset,
215
                          char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
216
                          int *replacement_ambig_part_size, char *replacement_string, int *type);
217
  bool InsertIntoTable(UnicharAmbigsVector &table, int test_ambig_part_size,
218
                       UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size,
219
                       const char *replacement_string, int type, AmbigSpec *ambig_spec,
220
                       UNICHARSET *unicharset);
221
222
  UnicharAmbigsVector dang_ambigs_; // elements deleted in ~UnicharAmbigs
223
  UnicharAmbigsVector replace_ambigs_; // elements deleted in ~UnicharAmbigs
224
  std::vector<UnicharIdVector *> one_to_one_definite_ambigs_; // elements deleted in ~UnicharAmbigs
225
  std::vector<UnicharIdVector *> ambigs_for_adaption_; // NOTE(review): not deleted in ~UnicharAmbigs
226
  std::vector<UnicharIdVector *> reverse_ambigs_for_adaption_; // NOTE(review): not deleted in ~UnicharAmbigs
227
};
228
229
} // namespace tesseract
230
231
#endif // !defined(DISABLED_LEGACY_ENGINE)
232
233
#endif // TESSERACT_CCUTIL_AMBIGS_H_