Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/classify/classify.h
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////
2
// File:        classify.h
3
// Description: classify class.
4
// Author:      Samuel Charron
5
//
6
// (C) Copyright 2006, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
//
17
///////////////////////////////////////////////////////////////////////
18
19
#ifndef TESSERACT_CLASSIFY_CLASSIFY_H_
20
#define TESSERACT_CLASSIFY_CLASSIFY_H_
21
22
// Include automatically generated configuration file if running autoconf.
23
#ifdef HAVE_CONFIG_H
24
#  include "config_auto.h"
25
#endif
26
27
#ifdef DISABLED_LEGACY_ENGINE
28
29
#  include "ccstruct.h"
30
#  include "dict.h"
31
32
namespace tesseract {
33
34
class Classify : public CCStruct { // Reduced Classify, compiled only when DISABLED_LEGACY_ENGINE is defined.
35
public:
36
  Classify();
37
  virtual ~Classify();
38
  virtual Dict &getDict() { // Returns a mutable reference to the dict_ member.
39
    return dict_;
40
  }
41
42
  // Member variables (declared through the *_VAR_H macros, which are defined outside this file).
43
44
  INT_VAR_H(classify_debug_level);
45
  BOOL_VAR_H(classify_bln_numeric_mode);
46
  double_VAR_H(classify_max_rating_ratio);
47
  double_VAR_H(classify_max_certainty_margin);
48
49
private:
50
  Dict dict_; // Dictionary object handed out by getDict().
51
};
52
53
} // namespace tesseract
54
55
#else // DISABLED_LEGACY_ENGINE not defined
56
57
#  include "adaptive.h"
58
#  include "ccstruct.h"
59
#  include "dict.h"
60
#  include "featdefs.h"
61
#  include "fontinfo.h"
62
#  include "intfx.h"
63
#  include "intmatcher.h"
64
#  include "normalis.h"
65
#  include "ocrfeatures.h"
66
#  include "ratngs.h"
67
#  include "unicity_table.h"
68
69
namespace tesseract {
70
71
class ScrollView;
72
class WERD_CHOICE;
73
class WERD_RES;
74
struct ADAPT_RESULTS;
75
struct NORM_PROTOS;
76
77
constexpr int kUnknownFontinfoId = -1; // Font-info id constant; presumably flags an unrecognised font (confirm at use sites).
78
constexpr int kBlankFontinfoId = -2; // Font-info id constant; presumably marks a blank result (confirm at use sites).
79
80
class ShapeClassifier;
81
struct ShapeRating;
82
class ShapeTable;
83
struct UnicharRating;
84
85
// How segmented is a blob. In this enum, character refers to a classifiable
86
// unit, but that is too long and character is usually easier to understand.
87
enum CharSegmentationType {
88
  CST_FRAGMENT, // A partial character.
89
  CST_WHOLE,    // A correctly segmented character.
90
  CST_IMPROPER, // More than one but fewer than two characters.
91
  CST_NGRAM     // Multiple characters.
92
};
93
94
class TESS_API Classify : public CCStruct {
95
public:
96
  Classify();
97
  ~Classify() override;
98
7.26M
  virtual Dict &getDict() { // Returns a mutable reference to the dict_ member.
99
7.26M
    return dict_;
100
7.26M
  }
101
102
0
  const ShapeTable *shape_table() const { // Read-only view of shape_table_; that member defaults to nullptr, so the result may be null.
103
0
    return shape_table_;
104
0
  }
105
106
  // Takes ownership of the given classifier, and uses it for future calls
107
  // to CharNormClassifier.
108
  void SetStaticClassifier(ShapeClassifier *static_classifier);
109
110
  // Adds a noise classification result that is a bit worse than the worst
111
  // current result, or the worst possible result if no current results.
112
  void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices);
113
114
  // Returns true if the blob is small enough to be a large speckle.
115
  bool LargeSpeckle(const TBLOB &blob);
116
117
  /* adaptive.cpp ************************************************************/
118
  int GetFontinfoId(ADAPT_CLASS_STRUCT *Class, uint8_t ConfigId);
119
  // Runs the class pruner from int_templates on the given features, returning
120
  // the number of classes output in results.
121
  //    int_templates          Class pruner tables
122
  //    num_features           Number of features in blob
123
  //    features               Array of features
124
  //    normalization_factors  (input) Array of int_templates->NumClasses fudge
125
  //                           factors from blob normalization process.
126
  //                           (Indexed by CLASS_INDEX)
127
  //    expected_num_features  (input) Array of int_templates->NumClasses
128
  //                           expected number of features for each class.
129
  //                           (Indexed by CLASS_INDEX)
130
  //    results                (output) Sorted Array of pruned classes.
131
  //                           Array must be sized to take the maximum possible
132
  //                           number of outputs : int_templates->NumClasses.
133
  int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this,
134
                   const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors,
135
                   const uint16_t *expected_num_features, std::vector<CP_RESULT_STRUCT> *results);
136
  void ReadNewCutoffs(TFile *fp, uint16_t *Cutoffs);
137
  void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates);
138
  void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates);
139
  ADAPT_TEMPLATES_STRUCT *ReadAdaptedTemplates(TFile *File);
140
  /* normmatch.cpp ************************************************************/
141
  float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch);
142
  void FreeNormProtos();
143
  NORM_PROTOS *ReadNormProtos(TFile *fp);
144
  /* protos.cpp ***************************************************************/
145
  void ConvertProto(PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class);
146
  INT_TEMPLATES_STRUCT *CreateIntTemplates(CLASSES FloatProtos, const UNICHARSET &target_unicharset);
147
  /* adaptmatch.cpp ***********************************************************/
148
149
  // Learns the given word using its chopped_word, seam_array, denorm,
150
  // box_word, best_state, and correct_text to learn both correctly and
151
  // incorrectly segmented blobs. If fontname is not nullptr, then LearnBlob
152
  // is called and the data will be saved in an internal buffer.
153
  // Otherwise AdaptToBlob is called for adaption within a document.
154
  void LearnWord(const char *fontname, WERD_RES *word);
155
156
  // Builds a blob of length fragments, from the word, starting at start,
157
  // and then learns it, as having the given correct_text.
158
  // If fontname is not nullptr, then LearnBlob is called and the data will be
159
  // saved in an internal buffer for static training.
160
  // Otherwise AdaptToBlob is called for adaption within a document.
161
  // threshold is a magic number required by AdaptToChar and generated by
162
  // ComputeAdaptionThresholds.
163
  // Although it can be partly inferred from the string, segmentation is
164
  // provided to explicitly clarify the character segmentation.
165
  void LearnPieces(const char *fontname, int start, int length, float threshold,
166
                   CharSegmentationType segmentation, const char *correct_text, WERD_RES *word);
167
  void InitAdaptiveClassifier(TessdataManager *mgr);
168
  void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class,
169
                        ADAPT_TEMPLATES_STRUCT *Templates);
170
  void AmbigClassifier(const std::vector<INT_FEATURE_STRUCT> &int_features,
171
                       const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob,
172
                       INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes, UNICHAR_ID *ambiguities,
173
                       ADAPT_RESULTS *results);
174
  void MasterMatcher(INT_TEMPLATES_STRUCT *templates, int16_t num_features,
175
                     const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors,
176
                     ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier, const TBOX &blob_box,
177
                     const std::vector<CP_RESULT_STRUCT> &results, ADAPT_RESULTS *final_results);
178
  // Converts configs to fonts, and if the result is not adapted, and a
179
  // shape_table_ is present, the shape is expanded to include all
180
  // unichar_ids represented, before applying a set of corrections to the
181
  // distance rating in int_result, (see ComputeCorrectedRating.)
182
  // The results are added to the final_results output.
183
  void ExpandShapesAndApplyCorrections(ADAPT_CLASS_STRUCT **classes, bool debug, int class_id, int bottom,
184
                                       int top, float cp_rating, int blob_length,
185
                                       int matcher_multiplier, const uint8_t *cn_factors,
186
                                       UnicharRating *int_result, ADAPT_RESULTS *final_results);
187
  // Applies a set of corrections to the distance im_rating,
188
  // including the cn_correction, miss penalty and additional penalty
189
  // for non-alnums being vertical misfits. Returns the corrected distance.
190
  double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating,
191
                                int feature_misses, int bottom, int top, int blob_length,
192
                                int matcher_multiplier, const uint8_t *cn_factors);
193
  void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results,
194
                               BLOB_CHOICE_LIST *Choices);
195
  void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results);
196
  int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures);
197
198
#  ifndef GRAPHICS_DISABLED
199
  void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results);
200
#  endif
201
  PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[],
202
                             INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class, BIT_VECTOR TempProtoMask);
203
  int MakeNewTemporaryConfig(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId,
204
                             int NumFeatures, INT_FEATURE_ARRAY Features,
205
                             FEATURE_SET FloatFeatures);
206
  void MakePermanent(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob);
207
  void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results);
208
  void RemoveExtraPuncs(ADAPT_RESULTS *Results);
209
  void RemoveBadMatches(ADAPT_RESULTS *Results);
210
  void SetAdaptiveThreshold(float Threshold);
211
  void ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features, int num_features);
212
  // Returns a string for the classifier class_id: either the corresponding
213
  // unicharset debug_str or the shape_table_ debug str.
214
  std::string ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id,
215
                                int config_id) const;
216
  // Converts a classifier class_id index with a config ID to:
217
  // shape_table_ present: a shape_table_ index OR
218
  // No shape_table_: a font ID.
219
  // Without shape training, each class_id, config pair represents a single
220
  // unichar id/font combination, so this function looks up the corresponding
221
  // font id.
222
  // With shape training, each class_id, config pair represents a single
223
  // shape table index, so the fontset_table stores the shape table index,
224
  // and the shape_table_ must be consulted to obtain the actual unichar_id/
225
  // font combinations that the shape represents.
226
  int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const;
227
  // Converts a shape_table_ index to a classifier class_id index (not a
228
  // unichar-id!). Uses a search, so not fast.
229
  int ShapeIDToClassID(int shape_id) const;
230
  UNICHAR_ID *BaselineClassifier(TBLOB *Blob, const std::vector<INT_FEATURE_STRUCT> &int_features,
231
                                 const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES_STRUCT *Templates,
232
                                 ADAPT_RESULTS *Results);
233
  int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results);
234
235
  // As CharNormClassifier, but operates on a TrainingSample and outputs to
236
  // a vector of UnicharRating without conversion to classes.
237
  int CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample,
238
                             std::vector<UnicharRating> *results);
239
  UNICHAR_ID *GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass);
240
  void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results);
241
  void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold,
242
                   ADAPT_TEMPLATES_STRUCT *adaptive_templates);
243
  void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class);
244
  bool AdaptableWord(WERD_RES *word);
245
  void EndAdaptiveClassifier();
246
  void SetupPass1();
247
  void SetupPass2();
248
  void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices);
249
  void ClassifyAsNoise(ADAPT_RESULTS *Results);
250
  void ResetAdaptiveClassifierInternal();
251
  void SwitchAdaptiveClassifier();
252
  void StartBackupAdaptiveClassifier();
253
254
  int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES_STRUCT *templates,
255
                         uint8_t *pruner_norm_array, uint8_t *char_norm_array);
256
  // Computes the char_norm_array for the unicharset and, if not nullptr, the
257
  // pruner_array as appropriate according to the existence of the shape_table.
258
  // The norm_feature is deleted as it is almost certainly no longer needed.
259
  void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates,
260
                             uint8_t *char_norm_array, uint8_t *pruner_array);
261
262
  bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG_STRUCT *config);
263
  void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob);
264
265
7.06k
  bool AdaptiveClassifierIsFull() const { // True once NumAdaptationsFailed is positive.
266
7.06k
    return NumAdaptationsFailed > 0;
267
7.06k
  }
268
7.06k
  bool AdaptiveClassifierIsEmpty() const { // True when AdaptedTemplates has no permanent classes.
269
7.06k
    return AdaptedTemplates->NumPermClasses == 0; // Dereferenced unchecked; AdaptedTemplates defaults to nullptr, so it must be set before this is called.
270
7.06k
  }
271
  bool LooksLikeGarbage(TBLOB *blob);
272
#ifndef GRAPHICS_DISABLED
273
  void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox);
274
#endif
275
  // intfx.cpp
276
  // Computes the DENORMS for bl(baseline) and cn(character) normalization
277
  // during feature extraction. The input denorm describes the current state
278
  // of the blob, which is usually a baseline-normalized word.
279
  // The transforms set up are as follows:
280
  // Baseline Normalized (bl) Output:
281
  //   We center the grapheme by aligning the x-coordinate of its centroid with
282
  //   x=128 and leaving the already-baseline-normalized y as-is.
283
  //
284
  // Character Normalized (cn) Output:
285
  //   We align the grapheme's centroid at the origin and scale it
286
  //   asymmetrically in x and y so that the 2nd moments are a standard value
287
  //   (51.2) ie the result is vaguely square.
288
  // If classify_nonlinear_norm is true:
289
  //   A non-linear normalization is set up that attempts to evenly distribute
290
  //   edges across x and y.
291
  //
292
  // Some of the fields of fx_info are also set up:
293
  // Length: Total length of outline.
294
  // Rx:     Rounded y second moment. (Reversed by convention.)
295
  // Ry:     Rounded x second moment.
296
  // Xmean:  Rounded x center of mass of the blob.
297
  // Ymean:  Rounded y center of mass of the blob.
298
  static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm,
299
                               DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info);
300
301
  // Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as
302
  // (x,y) position and angle as measured counterclockwise from the vector
303
  // <-1, 0>, from blob using two normalizations defined by bl_denorm and
304
  // cn_denorm. See SetupBLCNDenorms for definitions.
305
  // If outline_cn_counts is not nullptr, on return it contains the cumulative
306
  // number of cn features generated for each outline in the blob (in order).
307
  // Thus after the first outline, there were (*outline_cn_counts)[0] features,
308
  // after the second outline, there were (*outline_cn_counts)[1] features etc.
309
  static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm,
310
                              std::vector<INT_FEATURE_STRUCT> *bl_features,
311
                              std::vector<INT_FEATURE_STRUCT> *cn_features,
312
                              INT_FX_RESULT_STRUCT *results, std::vector<int> *outline_cn_counts);
313
  /* float2int.cpp ************************************************************/
314
  void ClearCharNormArray(uint8_t *char_norm_array);
315
  void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array);
316
  void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
317
  /* intproto.cpp *************************************************************/
318
  INT_TEMPLATES_STRUCT *ReadIntTemplates(TFile *fp);
319
  void WriteIntTemplates(FILE *File, INT_TEMPLATES_STRUCT *Templates, const UNICHARSET &target_unicharset);
320
  CLASS_ID GetClassToDebug(const char *Prompt, bool *adaptive_on, bool *pretrained_on,
321
                           int *shape_id);
322
  void ShowMatchDisplay();
323
  /* font detection ***********************************************************/
324
6
  UnicityTable<FontInfo> &get_fontinfo_table() { // Mutable access to the fontinfo_table_ member.
325
6
    return fontinfo_table_;
326
6
  }
327
7.52k
  const UnicityTable<FontInfo> &get_fontinfo_table() const { // Read-only overload returning the same fontinfo_table_ member.
328
7.52k
    return fontinfo_table_;
329
7.52k
  }
330
0
  UnicityTable<FontSet> &get_fontset_table() { // Mutable access to the fontset_table_ member.
331
0
    return fontset_table_;
332
0
  }
333
  /* mfoutline.cpp ***********************************************************/
334
  void NormalizeOutlines(LIST Outlines, float *XScale, float *YScale);
335
  /* outfeat.cpp ***********************************************************/
336
  FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob);
337
  /* picofeat.cpp ***********************************************************/
338
  FEATURE_SET ExtractPicoFeatures(TBLOB *Blob);
339
  FEATURE_SET ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info);
340
  FEATURE_SET ExtractIntGeoFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info);
341
  /* blobclass.cpp ***********************************************************/
342
  // Extracts features from the given blob and saves them in the tr_file_data_
343
  // member variable.
344
  // fontname:  Name of font that this blob was printed in.
345
  // cn_denorm: Character normalization transformation to apply to the blob.
346
  // fx_info:   Character normalization parameters computed with cn_denorm.
347
  // blob_text: Ground truth text for the blob.
348
  void LearnBlob(const std::string &fontname, TBLOB *Blob, const DENORM &cn_denorm,
349
                 const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text);
350
  // Writes stored training data to a .tr file based on the given filename.
351
  // Returns false on error.
352
  bool WriteTRFile(const char *filename);
353
354
  // Member variables.
355
356
  // Parameters.
357
  // Set during training (in lang.config) to indicate whether the divisible
358
  // blobs chopper should be used (true for latin script.)
359
  BOOL_VAR_H(allow_blob_division);
360
  // Set during training (in lang.config) to indicate whether the divisible
361
  // blobs chopper should be used in preference to chopping. Set to true for
362
  // southern Indic scripts.
363
  BOOL_VAR_H(prioritize_division);
364
  BOOL_VAR_H(classify_enable_learning);
365
  INT_VAR_H(classify_debug_level);
366
367
  /* mfoutline.cpp ***********************************************************/
368
  /* control knobs used to control normalization of outlines */
369
  INT_VAR_H(classify_norm_method);
370
  double_VAR_H(classify_char_norm_range);
371
  double_VAR_H(classify_max_rating_ratio);
372
  double_VAR_H(classify_max_certainty_margin);
373
374
  /* adaptmatch.cpp ***********************************************************/
375
  BOOL_VAR_H(tess_cn_matching);
376
  BOOL_VAR_H(tess_bn_matching);
377
  BOOL_VAR_H(classify_enable_adaptive_matcher);
378
  BOOL_VAR_H(classify_use_pre_adapted_templates);
379
  BOOL_VAR_H(classify_save_adapted_templates);
380
  BOOL_VAR_H(classify_enable_adaptive_debugger);
381
  BOOL_VAR_H(classify_nonlinear_norm);
382
  INT_VAR_H(matcher_debug_level);
383
  INT_VAR_H(matcher_debug_flags);
384
  INT_VAR_H(classify_learning_debug_level);
385
  double_VAR_H(matcher_good_threshold);
386
  double_VAR_H(matcher_reliable_adaptive_result);
387
  double_VAR_H(matcher_perfect_threshold);
388
  double_VAR_H(matcher_bad_match_pad);
389
  double_VAR_H(matcher_rating_margin);
390
  double_VAR_H(matcher_avg_noise_size);
391
  INT_VAR_H(matcher_permanent_classes_min);
392
  INT_VAR_H(matcher_min_examples_for_prototyping);
393
  INT_VAR_H(matcher_sufficient_examples_for_prototyping);
394
  double_VAR_H(matcher_clustering_max_angle_delta);
395
  double_VAR_H(classify_misfit_junk_penalty);
396
  double_VAR_H(rating_scale);
397
  double_VAR_H(tessedit_class_miss_scale);
398
  double_VAR_H(classify_adapted_pruning_factor);
399
  double_VAR_H(classify_adapted_pruning_threshold);
400
  INT_VAR_H(classify_adapt_proto_threshold);
401
  INT_VAR_H(classify_adapt_feature_threshold);
402
  BOOL_VAR_H(disable_character_fragments);
403
  double_VAR_H(classify_character_fragments_garbage_certainty_threshold);
404
  BOOL_VAR_H(classify_debug_character_fragments);
405
  BOOL_VAR_H(matcher_debug_separate_windows);
406
  STRING_VAR_H(classify_learn_debug_str);
407
408
  /* intmatcher.cpp **********************************************************/
409
  INT_VAR_H(classify_class_pruner_threshold);
410
  INT_VAR_H(classify_class_pruner_multiplier);
411
  INT_VAR_H(classify_cp_cutoff_strength);
412
  INT_VAR_H(classify_integer_matcher_multiplier);
413
414
  BOOL_VAR_H(classify_bln_numeric_mode);
415
  double_VAR_H(speckle_large_max_size);
416
  double_VAR_H(speckle_rating_penalty);
417
418
  // Use class variables to hold onto built-in templates and adapted templates.
419
  INT_TEMPLATES_STRUCT *PreTrainedTemplates = nullptr;
420
  ADAPT_TEMPLATES_STRUCT *AdaptedTemplates = nullptr;
421
  // The backup adapted templates are created from the previous page (only)
422
  // so they are always ready and reasonably well trained if the primary
423
  // adapted templates become full.
424
  ADAPT_TEMPLATES_STRUCT *BackupAdaptedTemplates = nullptr;
425
426
  // Create dummy proto and config masks for use with the built-in templates.
427
  BIT_VECTOR AllProtosOn = nullptr;
428
  BIT_VECTOR AllConfigsOn = nullptr;
429
  BIT_VECTOR AllConfigsOff = nullptr;
430
  BIT_VECTOR TempProtoMask = nullptr;
431
  /* normmatch.cpp */
432
  NORM_PROTOS *NormProtos = nullptr;
433
  /* font detection ***********************************************************/
434
  UnicityTable<FontInfo> fontinfo_table_;
435
  // Without shape training, each class_id, config pair represents a single
436
  // unichar id/font combination, so each fontset_table_ entry holds font ids
437
  // for each config in the class.
438
  // With shape training, each class_id, config pair represents a single
439
  // shape_table_ index, so the fontset_table_ stores the shape_table_ index,
440
  // and the shape_table_ must be consulted to obtain the actual unichar_id/
441
  // font combinations that the shape represents.
442
  UnicityTable<FontSet> fontset_table_;
443
444
protected:
445
  IntegerMatcher im_;
446
  FEATURE_DEFS_STRUCT feature_defs_;
447
  // If a shape_table_ is present, it is used to remap classifier output in
448
  // ExpandShapesAndApplyCorrections. font_ids referenced by configs actually
449
  // mean an index to the shape_table_ and the choices returned are *all* the
450
  // shape_table_ entries at that index.
451
  ShapeTable *shape_table_ = nullptr;
452
453
private:
454
  // The currently active static classifier.
455
  ShapeClassifier *static_classifier_ = nullptr;
456
#ifndef GRAPHICS_DISABLED
457
  ScrollView *learn_debug_win_ = nullptr;
458
  ScrollView *learn_fragmented_word_debug_win_ = nullptr;
459
  ScrollView *learn_fragments_debug_win_ = nullptr;
460
#endif
461
462
  // Training data gathered here for all the images in a document.
463
  std::string tr_file_data_;
464
465
  Dict dict_;
466
467
  std::vector<uint16_t> shapetable_cutoffs_;
468
469
  /* variables used to hold performance statistics */
470
  int NumAdaptationsFailed = 0;
471
472
  // Expected number of features in the class pruner, used to penalize
473
  // unknowns that have too few features (like a c being classified as e) so
474
  // it doesn't recognize everything as '@' or '#'.
475
  // CharNormCutoffs is for the static classifier (with no shapetable).
476
  // BaselineCutoffs gets a copy of CharNormCutoffs as an estimate of the real
477
  // value in the adaptive classifier. Both are indexed by unichar_id.
478
  // shapetable_cutoffs_ provides a similar value for each shape in the
479
  // shape_table_
480
  uint16_t CharNormCutoffs[MAX_NUM_CLASSES];
481
  uint16_t BaselineCutoffs[MAX_NUM_CLASSES];
482
483
public:
484
  bool EnableLearning = true;
485
};
486
487
} // namespace tesseract
488
489
#endif // DISABLED_LEGACY_ENGINE
490
491
#endif // TESSERACT_CLASSIFY_CLASSIFY_H_