/src/tesseract/src/ccstruct/params_training_featdef.h

Source (jump to first uncovered line)
///////////////////////////////////////////////////////////////////////
// File:        params_training_featdef.h
// Description: Feature definitions for params training.
// Author:      Rika Antonova
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
#define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_

#include <cstring> // for memset
#include <string>
#include <vector>

namespace tesseract {

// Upper bounds, counted in unichars, on the length of words that are
// treated as small and as medium sized respectively.
static constexpr int kMaxSmallWordUnichars = 3;
static constexpr int kMaxMediumWordUnichars = 6;

// Raw features extracted from a single OCR hypothesis.
// The features are normalized (by outline length or number of unichars as
// appropriate) real-valued quantities with unbounded range and
// unknown distribution.
// Normalization / binarization of these features is done at a later stage.
//
// The enumerators are used as indices, so they must stay contiguous and
// start at 0; PTRAIN_NUM_FEATURE_TYPES must remain the last entry because it
// doubles as the number of features.
// Note: when adding new fields to this enum make sure to modify
// kParamsTrainingFeatureTypeName (enforced by the static_assert below).
enum kParamsTrainingFeatureType {
  // Digits
  PTRAIN_DIGITS_SHORT, // 0
  PTRAIN_DIGITS_MED,   // 1
  PTRAIN_DIGITS_LONG,  // 2
  // Number or pattern (NUMBER_PERM, USER_PATTERN_PERM)
  PTRAIN_NUM_SHORT, // 3
  PTRAIN_NUM_MED,   // 4
  PTRAIN_NUM_LONG,  // 5
  // Document word (DOC_DAWG_PERM)
  PTRAIN_DOC_SHORT, // 6
  PTRAIN_DOC_MED,   // 7
  PTRAIN_DOC_LONG,  // 8
  // Word (SYSTEM_DAWG_PERM, USER_DAWG_PERM, COMPOUND_PERM)
  PTRAIN_DICT_SHORT, // 9
  PTRAIN_DICT_MED,   // 10
  PTRAIN_DICT_LONG,  // 11
  // Frequent word (FREQ_DAWG_PERM)
  PTRAIN_FREQ_SHORT,          // 12
  PTRAIN_FREQ_MED,            // 13
  PTRAIN_FREQ_LONG,           // 14
  PTRAIN_SHAPE_COST_PER_CHAR, // 15
  PTRAIN_NGRAM_COST_PER_CHAR, // 16
  PTRAIN_NUM_BAD_PUNC,        // 17
  PTRAIN_NUM_BAD_CASE,        // 18
  PTRAIN_XHEIGHT_CONSISTENCY, // 19
  PTRAIN_NUM_BAD_CHAR_TYPE,   // 20
  PTRAIN_NUM_BAD_SPACING,     // 21
  PTRAIN_NUM_BAD_FONT,        // 22
  PTRAIN_RATING_PER_CHAR,     // 23

  // Number of feature types; not a feature itself. Keep this entry last.
  PTRAIN_NUM_FEATURE_TYPES
};

// Printable name of each feature, indexed by kParamsTrainingFeatureType.
// Entry i must be the spelling of the enumerator whose value is i.
static const char *const kParamsTrainingFeatureTypeName[] = {
    "PTRAIN_DIGITS_SHORT",        // 0
    "PTRAIN_DIGITS_MED",          // 1
    "PTRAIN_DIGITS_LONG",         // 2
    "PTRAIN_NUM_SHORT",           // 3
    "PTRAIN_NUM_MED",             // 4
    "PTRAIN_NUM_LONG",            // 5
    "PTRAIN_DOC_SHORT",           // 6
    "PTRAIN_DOC_MED",             // 7
    "PTRAIN_DOC_LONG",            // 8
    "PTRAIN_DICT_SHORT",          // 9
    "PTRAIN_DICT_MED",            // 10
    "PTRAIN_DICT_LONG",           // 11
    "PTRAIN_FREQ_SHORT",          // 12
    "PTRAIN_FREQ_MED",            // 13
    "PTRAIN_FREQ_LONG",           // 14
    "PTRAIN_SHAPE_COST_PER_CHAR", // 15
    "PTRAIN_NGRAM_COST_PER_CHAR", // 16
    "PTRAIN_NUM_BAD_PUNC",        // 17
    "PTRAIN_NUM_BAD_CASE",        // 18
    "PTRAIN_XHEIGHT_CONSISTENCY", // 19
    "PTRAIN_NUM_BAD_CHAR_TYPE",   // 20
    "PTRAIN_NUM_BAD_SPACING",     // 21
    "PTRAIN_NUM_BAD_FONT",        // 22
    "PTRAIN_RATING_PER_CHAR",     // 23
};

// Fail the build if the enum and the name table drift apart: indexing the
// table with an enumerator that has no name would read past its end.
static_assert(static_cast<int>(sizeof(kParamsTrainingFeatureTypeName) /
                               sizeof(kParamsTrainingFeatureTypeName[0])) ==
                  PTRAIN_NUM_FEATURE_TYPES,
              "kParamsTrainingFeatureTypeName must have exactly one entry per "
              "kParamsTrainingFeatureType feature");

// Looks up a params-training feature by its name.
// Returns the index of the given feature (by name),
// or -1 meaning the feature is unknown.
// NOTE(review): only the declaration is visible in this header; presumably
// the accepted names are the entries of kParamsTrainingFeatureTypeName and
// the returned index is the matching kParamsTrainingFeatureType value --
// confirm against the definition.
int ParamsTrainingFeatureByName(const char *name);

// Entry with features extracted from a single OCR hypothesis for a word.
//
// A default-constructed entry has an empty string, all features set to zero
// and a cost of zero.
//
// Copy and move operations are deliberately left to the compiler (Rule of
// Zero). Member-wise copy of the array is well defined for self-assignment,
// unlike a hand-written memcpy() between identical buffers, and the implicit
// noexcept move operations let std::vector relocate entries without
// deep-copying the string.
struct ParamsTrainingHypothesis {
  std::string str; // string corresponding to word hypothesis (for debugging)
  // Feature values, indexed by kParamsTrainingFeatureType; zero-initialized.
  float features[PTRAIN_NUM_FEATURE_TYPES] = {};
  float cost = 0.0f; // path cost computed by segsearch
};

// A list of hypotheses explored during one run of segmentation search.
// Entries are stored by value in a std::vector, so references to them can be
// invalidated when the list grows.
using ParamsTrainingHypothesisList = std::vector<ParamsTrainingHypothesis>;

// A bundle that accumulates all of the hypothesis lists explored during all
// of the runs of segmentation search on a word (e.g. a list of hypotheses
// explored on PASS1, PASS2, fix xheight pass, etc).
class ParamsTrainingBundle {
public:
  ParamsTrainingBundle() = default;

  // Starts a new hypothesis list.
  // Should be called at the beginning of a new run of the segmentation search.
  // The new list is empty and becomes the target of AddHypothesis().
  void StartHypothesisList() {
    hyp_list_vec.emplace_back();
  }

  // Adds a copy of the given ParamsTrainingHypothesis to the current (most
  // recently started) hypothesis list and returns a reference to the newly
  // added entry. If no list has been started yet, one is started first.
  // The returned reference may be invalidated by later additions to the same
  // list, since the list is a std::vector.
  ParamsTrainingHypothesis &AddHypothesis(const ParamsTrainingHypothesis &other) {
    if (hyp_list_vec.empty()) {
      StartHypothesisList();
    }
    ParamsTrainingHypothesisList &current = hyp_list_vec.back();
    // Copy straight into the list; wrapping `other` in a temporary first
    // would only add a second full copy of the hypothesis.
    current.push_back(other);
    return current.back();
  }

  // One hypothesis list per StartHypothesisList() call, in call order.
  std::vector<ParamsTrainingHypothesisList> hyp_list_vec;
};

} // namespace tesseract

#endif // TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_

Coverage Report

Created: 2025-06-13 07:15

Line	Count	Source (jump to first uncovered line)
1		///////////////////////////////////////////////////////////////////////
2		// File: params_training_featdef.h
3		// Description: Feature definitions for params training.
4		// Author: Rika Antonova
5		//
6		// (C) Copyright 2011, Google Inc.
7		// Licensed under the Apache License, Version 2.0 (the "License");
8		// you may not use this file except in compliance with the License.
9		// You may obtain a copy of the License at
10		// http://www.apache.org/licenses/LICENSE-2.0
11		// Unless required by applicable law or agreed to in writing, software
12		// distributed under the License is distributed on an "AS IS" BASIS,
13		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		// See the License for the specific language governing permissions and
15		// limitations under the License.
16		//
17		///////////////////////////////////////////////////////////////////////
18
19		#ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
20		#define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
21
22		#include <cstring> // for memset
23		#include <string>
24		#include <vector>
25
26		namespace tesseract {
27
28		// Maximum number of unichars in the small and medium sized words
29		static const int kMaxSmallWordUnichars = 3;
30		static const int kMaxMediumWordUnichars = 6;
31
32		// Raw features extracted from a single OCR hypothesis.
33		// The features are normalized (by outline length or number of unichars as
34		// appropriate) real-valued quantities with unbounded range and
35		// unknown distribution.
36		// Normalization / binarization of these features is done at a later stage.
37		// Note: when adding new fields to this enum make sure to modify
38		// kParamsTrainingFeatureTypeName
39		enum kParamsTrainingFeatureType {
40		// Digits
41		PTRAIN_DIGITS_SHORT, // 0
42		PTRAIN_DIGITS_MED, // 1
43		PTRAIN_DIGITS_LONG, // 2
44		// Number or pattern (NUMBER_PERM, USER_PATTERN_PERM)
45		PTRAIN_NUM_SHORT, // 3
46		PTRAIN_NUM_MED, // 4
47		PTRAIN_NUM_LONG, // 5
48		// Document word (DOC_DAWG_PERM)
49		PTRAIN_DOC_SHORT, // 6
50		PTRAIN_DOC_MED, // 7
51		PTRAIN_DOC_LONG, // 8
52		// Word (SYSTEM_DAWG_PERM, USER_DAWG_PERM, COMPOUND_PERM)
53		PTRAIN_DICT_SHORT, // 9
54		PTRAIN_DICT_MED, // 10
55		PTRAIN_DICT_LONG, // 11
56		// Frequent word (FREQ_DAWG_PERM)
57		PTRAIN_FREQ_SHORT, // 12
58		PTRAIN_FREQ_MED, // 13
59		PTRAIN_FREQ_LONG, // 14
60		PTRAIN_SHAPE_COST_PER_CHAR, // 15
61		PTRAIN_NGRAM_COST_PER_CHAR, // 16
62		PTRAIN_NUM_BAD_PUNC, // 17
63		PTRAIN_NUM_BAD_CASE, // 18
64		PTRAIN_XHEIGHT_CONSISTENCY, // 19
65		PTRAIN_NUM_BAD_CHAR_TYPE, // 20
66		PTRAIN_NUM_BAD_SPACING, // 21
67		PTRAIN_NUM_BAD_FONT, // 22
68		PTRAIN_RATING_PER_CHAR, // 23
69
70		PTRAIN_NUM_FEATURE_TYPES
71		};
72
73		static const char *const kParamsTrainingFeatureTypeName[] = {
74		"PTRAIN_DIGITS_SHORT", // 0
75		"PTRAIN_DIGITS_MED", // 1
76		"PTRAIN_DIGITS_LONG", // 2
77		"PTRAIN_NUM_SHORT", // 3
78		"PTRAIN_NUM_MED", // 4
79		"PTRAIN_NUM_LONG", // 5
80		"PTRAIN_DOC_SHORT", // 6
81		"PTRAIN_DOC_MED", // 7
82		"PTRAIN_DOC_LONG", // 8
83		"PTRAIN_DICT_SHORT", // 9
84		"PTRAIN_DICT_MED", // 10
85		"PTRAIN_DICT_LONG", // 11
86		"PTRAIN_FREQ_SHORT", // 12
87		"PTRAIN_FREQ_MED", // 13
88		"PTRAIN_FREQ_LONG", // 14
89		"PTRAIN_SHAPE_COST_PER_CHAR", // 15
90		"PTRAIN_NGRAM_COST_PER_CHAR", // 16
91		"PTRAIN_NUM_BAD_PUNC", // 17
92		"PTRAIN_NUM_BAD_CASE", // 18
93		"PTRAIN_XHEIGHT_CONSISTENCY", // 19
94		"PTRAIN_NUM_BAD_CHAR_TYPE", // 20
95		"PTRAIN_NUM_BAD_SPACING", // 21
96		"PTRAIN_NUM_BAD_FONT", // 22
97		"PTRAIN_RATING_PER_CHAR", // 23
98		};
99
100		// Returns the index of the given feature (by name),
101		// or -1 meaning the feature is unknown.
102		int ParamsTrainingFeatureByName(const char *name);
103
104		// Entry with features extracted from a single OCR hypothesis for a word.
105		struct ParamsTrainingHypothesis {
106	399k	ParamsTrainingHypothesis() : cost(0.0) {
107	399k	memset(features, 0, sizeof(features));
108	399k	}
109	0	ParamsTrainingHypothesis(const ParamsTrainingHypothesis &other) {
110	0	memcpy(features, other.features, sizeof(features));
111	0	str = other.str;
112	0	cost = other.cost;
113	0	}
114	0	ParamsTrainingHypothesis &operator=(const ParamsTrainingHypothesis &other) {
115	0	memcpy(features, other.features, sizeof(features));
116	0	str = other.str;
117	0	cost = other.cost;
118	0	return *this;
119	0	}
120		std::string str; // string corresponding to word hypothesis (for debugging)
121		float features[PTRAIN_NUM_FEATURE_TYPES];
122		float cost; // path cost computed by segsearch
123		};
124
125		// A list of hypotheses explored during one run of segmentation search.
126		using ParamsTrainingHypothesisList = std::vector<ParamsTrainingHypothesis>;
127
128		// A bundle that accumulates all of the hypothesis lists explored during all
129		// of the runs of segmentation search on a word (e.g. a list of hypotheses
130		// explored on PASS1, PASS2, fix xheight pass, etc).
131		class ParamsTrainingBundle {
132		public:
133	0	ParamsTrainingBundle() = default;
134		// Starts a new hypothesis list.
135		// Should be called at the beginning of a new run of the segmentation search.
136	0	void StartHypothesisList() {
137	0	hyp_list_vec.emplace_back();
138	0	}
139		// Adds a new ParamsTrainingHypothesis to the current hypothesis list
140		// and returns the reference to the newly added entry.
141	0	ParamsTrainingHypothesis &AddHypothesis(const ParamsTrainingHypothesis &other) {
142	0	if (hyp_list_vec.empty()) {
143	0	StartHypothesisList();
144	0	}
145	0	hyp_list_vec.back().push_back(ParamsTrainingHypothesis(other));
146	0	return hyp_list_vec.back().back();
147	0	}
148
149		std::vector<ParamsTrainingHypothesisList> hyp_list_vec;
150		};
151
152		} // namespace tesseract
153
154		#endif // TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_