/src/tesseract/src/ccutil/ambigs.h

Source
///////////////////////////////////////////////////////////////////////
// File:        ambigs.h
// Description: Constants, flags, functions for dealing with
//              ambiguities (training and recognition).
// Author:      Daria Antonova
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_CCUTIL_AMBIGS_H_
#define TESSERACT_CCUTIL_AMBIGS_H_

#ifdef HAVE_CONFIG_H
#  include "config_auto.h" // DISABLED_LEGACY_ENGINE
#endif

#if !defined(DISABLED_LEGACY_ENGINE)

#  include <tesseract/unichar.h>
#  include "elst.h"
#  include "tprintf.h"
#  include "unicharset.h"

// Bound used to size the fixed unichar-id arrays in AmbigSpec; each of those
// arrays reserves one additional slot on top of this value.
#  define MAX_AMBIG_SIZE 10

namespace tesseract {

// A resizable sequence of unichar ids.
using UnicharIdVector = std::vector<UNICHAR_ID>;

// Describes how an (ocred ngram, correct ngram) pair is to be treated.
enum AmbigType {
  // The ngram pair is not ambiguous.
  NOT_AMBIG = 0,
  // The ocred ngram should always be substituted with the correct one.
  REPLACE_AMBIG,
  // Add the correct ngram to the classifier results (1-1).
  DEFINITE_AMBIG,
  // Use the pairwise classifier for the ocred/correct pair (1-1).
  SIMILAR_AMBIG,
  // This is a case ambiguity (1-1).
  CASE_AMBIG,

  // Number of enum entries; relies on being the last enumerator.
  AMBIG_TYPE_COUNT
};

// A collection of utility functions for arrays of UNICHAR_IDs that are
// terminated by INVALID_UNICHAR_ID.
class UnicharIdArrayUtils {
public:
  // Compares two arrays of unichar ids. Returns -1 if the length of array1 is
  // less than length of array2, if any array1[i] is less than array2[i].
  // Returns 0 if the arrays are equal, 1 otherwise.
  // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
  static inline int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2) {
    for (;;) {
      const UNICHAR_ID val1 = *ptr1++;
      const UNICHAR_ID val2 = *ptr2++;
      if (val1 != val2) {
        if (val1 == INVALID_UNICHAR_ID) {
          return -1;
        }
        if (val2 == INVALID_UNICHAR_ID) {
          return 1;
        }
        if (val1 < val2) {
          return -1;
        }
        return 1;
      }
      if (val1 == INVALID_UNICHAR_ID) {
        return 0;
      }
    }
  }

  // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
  // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
  // and that dst has enough space for all the elements from src.
  static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
    int i = 0;
    do {
      dst[i] = src[i];
    } while (dst[i++] != INVALID_UNICHAR_ID);
    return i - 1;
  }

  // Prints unichars corresponding to the unichar_ids in the given array.
  // The function assumes that array is terminated by INVALID_UNICHAR_ID.
  static inline void print(const UNICHAR_ID array[], const UNICHARSET &unicharset) {
    const UNICHAR_ID *ptr = array;
    if (*ptr == INVALID_UNICHAR_ID) {
      tprintf("[Empty]");
    }
    while (*ptr != INVALID_UNICHAR_ID) {
      tprintf("%s ", unicharset.id_to_unichar(*ptr++));
    }
    tprintf("( ");
    ptr = array;
    while (*ptr != INVALID_UNICHAR_ID) {
      tprintf("%d ", *ptr++);
    }
    tprintf(")\n");
  }
};

// A single ambiguity entry that can be linked into an AmbigSpec_LIST.
// An AmbigSpec_LIST stores a list of dangerous ambigs that
// start with the same unichar (e.g. r->t rn->m rr1->m).
class AmbigSpec : public ELIST<AmbigSpec>::LINK {
public:
  AmbigSpec();
  ~AmbigSpec() = default;

  // Comparator for sorting AmbigSpec_LISTs. Entries are ordered by their
  // wrong_ngram arrays first; entries with equal wrong_ngram arrays are
  // ordered by their correct_fragments arrays. Example of wrong_ngram
  // vectors in a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1].
  // Returns a negative, zero or positive value as s1 sorts before, equal to
  // or after s2.
  static int compare_ambig_specs(const AmbigSpec *s1, const AmbigSpec *s2) {
    const int wrong_order = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
    return (wrong_order != 0)
               ? wrong_order
               : UnicharIdArrayUtils::compare(s1->correct_fragments, s2->correct_fragments);
  }

  // Primary sort key. Handed to UnicharIdArrayUtils::compare, so it has to be
  // terminated by INVALID_UNICHAR_ID (hence the extra slot).
  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
  // Secondary sort key; same termination requirement as wrong_ngram.
  UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
  UNICHAR_ID correct_ngram_id;
  AmbigType type;
  // Presumably the number of ids stored in wrong_ngram -- confirm against the
  // code that fills this structure in.
  int wrong_ngram_size;
};
ELISTIZEH(AmbigSpec)

// An ambiguity table: entry i stores the list of ambiguities whose
// wrong ngram starts with unichar id i.
using UnicharAmbigsVector = std::vector<AmbigSpec_LIST *>;

// Holds the ambiguity tables (replaceable, dangerous, 1-1 definite and the
// tables used for adaption) and provides per-unichar-id access to them.
// NOTE(review): the class owns raw pointers and releases them in its
// destructor, yet it is still implicitly copyable; copying an instance would
// lead to the same pointers being deleted twice.
class UnicharAmbigs {
public:
  UnicharAmbigs() = default;
  // Releases every list/vector referenced from the tables. Entries may be
  // nullptr, which delete accepts.
  ~UnicharAmbigs() {
    for (auto data : replace_ambigs_) {
      delete data;
    }
    for (auto data : dang_ambigs_) {
      delete data;
    }
    for (auto data : one_to_one_definite_ambigs_) {
      delete data;
    }
    // The adaption tables were previously never released. They are freed here
    // on the assumption that, like the tables above, their entries are
    // allocated for and owned by this object.
    // NOTE(review): confirm against LoadUnicharAmbigs().
    for (auto data : ambigs_for_adaption_) {
      delete data;
    }
    for (auto data : reverse_ambigs_for_adaption_) {
      delete data;
    }
  }

  // Read-only access to the table of dangerous ambiguities.
  const UnicharAmbigsVector &dang_ambigs() const {
    return dang_ambigs_;
  }
  // Read-only access to the table of replaceable ambiguities.
  const UnicharAmbigsVector &replace_ambigs() const {
    return replace_ambigs_;
  }

  // Initializes the ambigs by adding a nullptr pointer to each table.
  void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption);

  // Loads the universal ambigs that are useful for any language.
  void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset);

  // Fills in two ambiguity tables (replaceable and dangerous) with information
  // read from the ambigs file. An ambiguity table is an array of lists.
  // The array is indexed by a class id. Each entry in the table provides
  // a list of potential ambiguities which can start with the corresponding
  // character. For example the ambiguity "rn -> m", would be located in the
  // table at index of unicharset.unichar_to_id('r').
  // 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
  // one_to_one_definite_ambigs_. This vector is also indexed by the class id
  // of the wrong part of the ambiguity and each entry contains a vector of
  // unichar ids that are ambiguous to it.
  // encoder_set is used to encode the ambiguity strings, undisturbed by new
  // unichar_ids that may be created by adding the ambigs.
  void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level,
                         bool use_ambigs_for_adaption, UNICHARSET *unicharset);

  // Returns definite 1-1 ambigs for the given unichar id, or nullptr if the
  // table is empty, the id lies outside the table, or the id has no entry.
  inline const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {
    return Lookup(one_to_one_definite_ambigs_, unichar_id);
  }

  // Returns a pointer to the vector with all unichar ids that appear in the
  // 'correct' part of the ambiguity pair when the given unichar id appears
  // in the 'wrong' part of the ambiguity. E.g. if DangAmbigs file consist of
  // m->rn,rn->m,m->iii, AmbigsForAdaption() called with unichar id of
  // m will return a pointer to a vector with unichar ids of r,n,i.
  // Returns nullptr if the table is empty, the id lies outside the table, or
  // the id has no entry.
  inline const UnicharIdVector *AmbigsForAdaption(UNICHAR_ID unichar_id) const {
    return Lookup(ambigs_for_adaption_, unichar_id);
  }

  // Similar to the above, but return the vector of unichar ids for which
  // the given unichar_id is an ambiguity (appears in the 'wrong' part of
  // some ambiguity pair).
  // Returns nullptr if the table is empty, the id lies outside the table, or
  // the id has no entry.
  inline const UnicharIdVector *ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const {
    return Lookup(reverse_ambigs_for_adaption_, unichar_id);
  }

private:
  using UnicharIdVectorTable = std::vector<UnicharIdVector *>;

  // Returns table[unichar_id], or nullptr when the id cannot index the table.
  // Guards against negative ids (such as INVALID_UNICHAR_ID) and ids past the
  // end of the table, which would otherwise read out of bounds; an empty
  // table rejects every id.
  static const UnicharIdVector *Lookup(const UnicharIdVectorTable &table, UNICHAR_ID unichar_id) {
    if (unichar_id < 0) {
      return nullptr;
    }
    if (static_cast<UnicharIdVectorTable::size_type>(unichar_id) >= table.size()) {
      return nullptr;
    }
    return table[unichar_id];
  }

  bool ParseAmbiguityLine(int line_num, int version, int debug_level, const UNICHARSET &unicharset,
                          char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
                          int *replacement_ambig_part_size, char *replacement_string, int *type);
  bool InsertIntoTable(UnicharAmbigsVector &table, int test_ambig_part_size,
                       UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size,
                       const char *replacement_string, int type, AmbigSpec *ambig_spec,
                       UNICHARSET *unicharset);

  UnicharAmbigsVector dang_ambigs_;
  UnicharAmbigsVector replace_ambigs_;
  std::vector<UnicharIdVector *> one_to_one_definite_ambigs_;
  std::vector<UnicharIdVector *> ambigs_for_adaption_;
  std::vector<UnicharIdVector *> reverse_ambigs_for_adaption_;
};

} // namespace tesseract

#endif // !defined(DISABLED_LEGACY_ENGINE)

#endif // TESSERACT_CCUTIL_AMBIGS_H_

Coverage Report

Created: 2025-11-16 06:50

Line	Count	Source
1		///////////////////////////////////////////////////////////////////////
2		// File: ambigs.h
3		// Description: Constants, flags, functions for dealing with
4		// ambiguities (training and recognition).
5		// Author: Daria Antonova
6		//
7		// (C) Copyright 2008, Google Inc.
8		// Licensed under the Apache License, Version 2.0 (the "License");
9		// you may not use this file except in compliance with the License.
10		// You may obtain a copy of the License at
11		// http://www.apache.org/licenses/LICENSE-2.0
12		// Unless required by applicable law or agreed to in writing, software
13		// distributed under the License is distributed on an "AS IS" BASIS,
14		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15		// See the License for the specific language governing permissions and
16		// limitations under the License.
17		//
18		///////////////////////////////////////////////////////////////////////
19
20		#ifndef TESSERACT_CCUTIL_AMBIGS_H_
21		#define TESSERACT_CCUTIL_AMBIGS_H_
22
23		#ifdef HAVE_CONFIG_H
24		# include "config_auto.h" // DISABLED_LEGACY_ENGINE
25		#endif
26
27		#if !defined(DISABLED_LEGACY_ENGINE)
28
29		# include <tesseract/unichar.h>
30		# include "elst.h"
31		# include "tprintf.h"
32		# include "unicharset.h"
33
34	152k	# define MAX_AMBIG_SIZE 10
35
36		namespace tesseract {
37
38		using UnicharIdVector = std::vector<UNICHAR_ID>;
39
40		enum AmbigType {
41		NOT_AMBIG, // the ngram pair is not ambiguous
42		REPLACE_AMBIG, // ocred ngram should always be substituted with correct
43		DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1)
44		SIMILAR_AMBIG, // use pairwise classifier for ocred/correct pair (1-1)
45		CASE_AMBIG, // this is a case ambiguity (1-1)
46
47		AMBIG_TYPE_COUNT // number of enum entries
48		};
49
50		// A collection of utility functions for arrays of UNICHAR_IDs that are
51		// terminated by INVALID_UNICHAR_ID.
52		class UnicharIdArrayUtils {
53		public:
54		// Compares two arrays of unichar ids. Returns -1 if the length of array1 is
55		// less than length of array2, if any array1[i] is less than array2[i].
56		// Returns 0 if the arrays are equal, 1 otherwise.
57		// The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
58	593M	static inline int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2) {
59	1.19G	for (;;) {
60	1.19G	const UNICHAR_ID val1 = *ptr1++;
61	1.19G	const UNICHAR_ID val2 = *ptr2++;
62	1.19G	if (val1 != val2) {
63	590M	if (val1 == INVALID_UNICHAR_ID) {
64	5.67M	return -1;
65	5.67M	}
66	585M	if (val2 == INVALID_UNICHAR_ID) {
67	116	return 1;
68	116	}
69	585M	if (val1 < val2) {
70	14.4M	return -1;
71	14.4M	}
72	570M	return 1;
73	585M	}
74	600M	if (val1 == INVALID_UNICHAR_ID) {
75	2.09M	return 0;
76	2.09M	}
77	600M	}
78	593M	}
79
80		// Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
81		// The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
82		// and that dst has enough space for all the elements from src.
83	76.1k	static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
84	76.1k	int i = 0;
85	303k	do {
86	303k	dst[i] = src[i];
87	303k	} while (dst[i++] != INVALID_UNICHAR_ID);
88	76.1k	return i - 1;
89	76.1k	}
90
91		// Prints unichars corresponding to the unichar_ids in the given array.
92		// The function assumes that array is terminated by INVALID_UNICHAR_ID.
93	0	static inline void print(const UNICHAR_ID array[], const UNICHARSET &unicharset) {
94	0	const UNICHAR_ID *ptr = array;
95	0	if (*ptr == INVALID_UNICHAR_ID) {
96	0	tprintf("[Empty]");
97	0	}
98	0	while (*ptr != INVALID_UNICHAR_ID) {
99	0	tprintf("%s ", unicharset.id_to_unichar(*ptr++));
100	0	}
101	0	tprintf("( ");
102	0	ptr = array;
103	0	while (*ptr != INVALID_UNICHAR_ID) {
104	0	tprintf("%d ", *ptr++);
105	0	}
106	0	tprintf(")\n");
107	0	}
108		};
109
110		// AMBIG_SPEC_LIST stores a list of dangerous ambigs that
111		// start with the same unichar (e.g. r->t rn->m rr1->m).
112		class AmbigSpec : public ELIST<AmbigSpec>::LINK {
113		public:
114		AmbigSpec();
115		~AmbigSpec() = default;
116
117		// Comparator function for sorting AmbigSpec_LISTs. The lists will
118		// be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors
119		// in a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1].
120	10.5M	static int compare_ambig_specs(const AmbigSpec *s1, const AmbigSpec *s2) {
121	10.5M	int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
122	10.5M	if (result != 0) {
123	10.5M	return result;
124	10.5M	}
125	436	return UnicharIdArrayUtils::compare(s1->correct_fragments, s2->correct_fragments);
126	10.5M	}
127
128		UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
129		UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
130		UNICHAR_ID correct_ngram_id;
131		AmbigType type;
132		int wrong_ngram_size;
133		};
134		ELISTIZEH(AmbigSpec)
135
136		// AMBIG_TABLE[i] stores a set of ambiguities whose
137		// wrong ngram starts with unichar id i.
138		using UnicharAmbigsVector = std::vector<AmbigSpec_LIST *>;
139
140		class UnicharAmbigs {
141		public:
142	8	UnicharAmbigs() = default;
143	0	~UnicharAmbigs() {
144	0	for (auto data : replace_ambigs_) {
145	0	delete data;
146	0	}
147	0	for (auto data : dang_ambigs_) {
148	0	delete data;
149	0	}
150	0	for (auto data : one_to_one_definite_ambigs_) {
151	0	delete data;
152	0	}
153	0	}
154
155	817k	const UnicharAmbigsVector &dang_ambigs() const {
156	817k	return dang_ambigs_;
157	817k	}
158	817k	const UnicharAmbigsVector &replace_ambigs() const {
159	817k	return replace_ambigs_;
160	817k	}
161
162		// Initializes the ambigs by adding a nullptr pointer to each table.
163		void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption);
164
165		// Loads the universal ambigs that are useful for any language.
166		void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset);
167
168		// Fills in two ambiguity tables (replaceable and dangerous) with information
169		// read from the ambigs file. An ambiguity table is an array of lists.
170		// The array is indexed by a class id. Each entry in the table provides
171		// a list of potential ambiguities which can start with the corresponding
172		// character. For example the ambiguity "rn -> m", would be located in the
173		// table at index of unicharset.unichar_to_id('r').
174		// In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
175		// one_to_one_definite_ambigs_. This vector is also indexed by the class id
176		// of the wrong part of the ambiguity and each entry contains a vector of
177		// unichar ids that are ambiguous to it.
178		// encoder_set is used to encode the ambiguity strings, undisturbed by new
179		// unichar_ids that may be created by adding the ambigs.
180		void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level,
181		bool use_ambigs_for_adaption, UNICHARSET *unicharset);
182
183		// Returns definite 1-1 ambigs for the given unichar id.
184	0	inline const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {
185	0	if (one_to_one_definite_ambigs_.empty()) {
186	0	return nullptr;
187	0	}
188	0	return one_to_one_definite_ambigs_[unichar_id];
189	0	}
190
191		// Returns a pointer to the vector with all unichar ids that appear in the
192		// 'correct' part of the ambiguity pair when the given unichar id appears
193		// in the 'wrong' part of the ambiguity. E.g. if DangAmbigs file consist of
194		// m->rn,rn->m,m->iii, UnicharAmbigsForAdaption() called with unichar id of
195		// m will return a pointer to a vector with unichar ids of r,n,i.
196	0	inline const UnicharIdVector *AmbigsForAdaption(UNICHAR_ID unichar_id) const {
197	0	if (ambigs_for_adaption_.empty()) {
198	0	return nullptr;
199	0	}
200	0	return ambigs_for_adaption_[unichar_id];
201	0	}
202
203		// Similar to the above, but return the vector of unichar ids for which
204		// the given unichar_id is an ambiguity (appears in the 'wrong' part of
205		// some ambiguity pair).
206	97	inline const UnicharIdVector *ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const {
207	97	if (reverse_ambigs_for_adaption_.empty()) {
208	97	return nullptr;
209	97	}
210	0	return reverse_ambigs_for_adaption_[unichar_id];
211	97	}
212
213		private:
214		bool ParseAmbiguityLine(int line_num, int version, int debug_level, const UNICHARSET &unicharset,
215		char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
216		int *replacement_ambig_part_size, char *replacement_string, int *type);
217		bool InsertIntoTable(UnicharAmbigsVector &table, int test_ambig_part_size,
218		UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size,
219		const char *replacement_string, int type, AmbigSpec *ambig_spec,
220		UNICHARSET *unicharset);
221
222		UnicharAmbigsVector dang_ambigs_;
223		UnicharAmbigsVector replace_ambigs_;
224		std::vector<UnicharIdVector *> one_to_one_definite_ambigs_;
225		std::vector<UnicharIdVector *> ambigs_for_adaption_;
226		std::vector<UnicharIdVector *> reverse_ambigs_for_adaption_;
227		};
228
229		} // namespace tesseract
230
231		#endif // !defined(DISABLED_LEGACY_ENGINE)
232
233		#endif // TESSERACT_CCUTIL_AMBIGS_H_