Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/ccutil/tessdatamanager.h
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////
2
// File:        tessdatamanager.h
3
// Description: Functions to handle loading/combining tesseract data files.
4
// Author:      Daria Antonova
5
//
6
// (C) Copyright 2009, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
//
17
///////////////////////////////////////////////////////////////////////
18
19
#ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
20
#define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
21
22
#include <tesseract/baseapi.h> // FileReader
23
#include <string>              // std::string
24
#include <vector>              // std::vector
25
#include "serialis.h"          // FileWriter
26
27
static const char kTrainedDataSuffix[] = "traineddata";
28
29
// When adding new tessdata types and file suffixes, please make sure to
30
// update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText (NOTE(review): the last is not declared in this header - confirm it still exists).
31
static const char kLangConfigFileSuffix[] = "config";
32
static const char kUnicharsetFileSuffix[] = "unicharset";
33
static const char kAmbigsFileSuffix[] = "unicharambigs";
34
static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
35
static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";
36
static const char kNormProtoFileSuffix[] = "normproto";
37
static const char kPuncDawgFileSuffix[] = "punc-dawg";
38
static const char kSystemDawgFileSuffix[] = "word-dawg";
39
static const char kNumberDawgFileSuffix[] = "number-dawg";
40
static const char kFreqDawgFileSuffix[] = "freq-dawg";
41
static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs"; // deprecated
42
static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset"; // deprecated
43
static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg"; // deprecated
44
static const char kShapeTableFileSuffix[] = "shapetable";
45
static const char kBigramDawgFileSuffix[] = "bigram-dawg";
46
static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
47
static const char kParamsModelFileSuffix[] = "params-model";
48
static const char kLSTMModelFileSuffix[] = "lstm";
49
static const char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg";
50
static const char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg";
51
static const char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg";
52
static const char kLSTMUnicharsetFileSuffix[] = "lstm-unicharset";
53
static const char kLSTMRecoderFileSuffix[] = "lstm-recoder";
54
static const char kVersionFileSuffix[] = "version";
55
56
namespace tesseract {
57
58
enum TessdataType { // Component index; order must match kTessdataFileSuffixes.
59
  TESSDATA_LANG_CONFIG,        // 0
60
  TESSDATA_UNICHARSET,         // 1
61
  TESSDATA_AMBIGS,             // 2
62
  TESSDATA_INTTEMP,            // 3
63
  TESSDATA_PFFMTABLE,          // 4
64
  TESSDATA_NORMPROTO,          // 5
65
  TESSDATA_PUNC_DAWG,          // 6
66
  TESSDATA_SYSTEM_DAWG,        // 7
67
  TESSDATA_NUMBER_DAWG,        // 8
68
  TESSDATA_FREQ_DAWG,          // 9
69
  TESSDATA_FIXED_LENGTH_DAWGS, // 10  // deprecated
70
  TESSDATA_CUBE_UNICHARSET,    // 11  // deprecated
71
  TESSDATA_CUBE_SYSTEM_DAWG,   // 12  // deprecated
72
  TESSDATA_SHAPE_TABLE,        // 13
73
  TESSDATA_BIGRAM_DAWG,        // 14
74
  TESSDATA_UNAMBIG_DAWG,       // 15
75
  TESSDATA_PARAMS_MODEL,       // 16
76
  TESSDATA_LSTM,               // 17
77
  TESSDATA_LSTM_PUNC_DAWG,     // 18
78
  TESSDATA_LSTM_SYSTEM_DAWG,   // 19
79
  TESSDATA_LSTM_NUMBER_DAWG,   // 20
80
  TESSDATA_LSTM_UNICHARSET,    // 21
81
  TESSDATA_LSTM_RECODER,       // 22
82
  TESSDATA_VERSION,            // 23
83
84
  TESSDATA_NUM_ENTRIES // Count of the types above; sizes the per-type entries_ array.
85
};
86
87
/**
88
 * kTessdataFileSuffixes[i] is the file suffix for tessdata of type i
89
 * (a TessdataType value), so entries must stay in enum order.
90
 */
91
static const char *const kTessdataFileSuffixes[] = { // NOTE(review): length is not checked against TESSDATA_NUM_ENTRIES here.
92
    kLangConfigFileSuffix,       // 0
93
    kUnicharsetFileSuffix,       // 1
94
    kAmbigsFileSuffix,           // 2
95
    kBuiltInTemplatesFileSuffix, // 3
96
    kBuiltInCutoffsFileSuffix,   // 4
97
    kNormProtoFileSuffix,        // 5
98
    kPuncDawgFileSuffix,         // 6
99
    kSystemDawgFileSuffix,       // 7
100
    kNumberDawgFileSuffix,       // 8
101
    kFreqDawgFileSuffix,         // 9
102
    kFixedLengthDawgsFileSuffix, // 10  // deprecated
103
    kCubeUnicharsetFileSuffix,   // 11  // deprecated
104
    kCubeSystemDawgFileSuffix,   // 12  // deprecated
105
    kShapeTableFileSuffix,       // 13
106
    kBigramDawgFileSuffix,       // 14
107
    kUnambigDawgFileSuffix,      // 15
108
    kParamsModelFileSuffix,      // 16
109
    kLSTMModelFileSuffix,        // 17
110
    kLSTMPuncDawgFileSuffix,     // 18
111
    kLSTMSystemDawgFileSuffix,   // 19
112
    kLSTMNumberDawgFileSuffix,   // 20
113
    kLSTMUnicharsetFileSuffix,   // 21
114
    kLSTMRecoderFileSuffix,      // 22
115
    kVersionFileSuffix,          // 23
116
};
117
118
/**
119
 * TessdataType could be updated to contain more entries, however
120
 * we do not expect that number to be astronomically high.
121
 * In order to automatically detect endianness TessdataManager will
122
 * swap the byte order if the actual number of tessdata entries is larger than
123
 * kMaxNumTessdataEntries.
124
 */
125
static const int kMaxNumTessdataEntries = 1000;
126
127
class TESS_API TessdataManager { // Holds the contents of each element of a traineddata file, indexed by TessdataType.
128
public:
129
  TessdataManager();
130
  explicit TessdataManager(FileReader reader); // reader: presumably kept in reader_ to load the file when needed - confirm in the .cpp.
131
132
2
  ~TessdataManager() = default;
133
134
0
  bool swap() const { // True if the bytes need swapping.
135
0
    return swap_;
136
0
  }
137
2
  bool is_loaded() const { // True if the file has been loaded.
138
2
    return is_loaded_;
139
2
  }
140
141
  // Lazily loads from the given filename. Won't actually read the file
142
  // until it needs it.
143
  void LoadFileLater(const char *data_file_name);
144
  /**
145
   * Opens and reads the given data file right now.
146
   * @return true on success.
147
   */
148
  bool Init(const char *data_file_name);
149
  // Loads from the given memory buffer as if a file, remembering name as some
150
  // arbitrary source id for caching.
151
  bool LoadMemBuffer(const char *name, const char *data, int size);
152
  // Overwrites a single entry of the given type.
153
  void OverwriteEntry(TessdataType type, const char *data, int size);
154
155
  // Saves to the given filename.
156
  bool SaveFile(const char *filename, FileWriter writer) const;
157
  // Serializes to the given vector.
158
  void Serialize(std::vector<char> *data) const;
159
  // Resets to the initial state, keeping the reader.
160
  void Clear();
161
162
  // Prints a directory of contents.
163
  void Directory() const;
164
165
  // Returns true if the component requested is present.
166
6
  bool IsComponentAvailable(TessdataType type) const {
167
6
    return !entries_[type].empty(); // NOTE(review): type is used as an index without a range check against TESSDATA_NUM_ENTRIES.
168
6
  }
169
  // Opens the given TFile pointer to the given component type.
170
  // Returns false in case of failure.
171
  bool GetComponent(TessdataType type, TFile *fp);
172
  // As non-const version except it can't load the component if not already
173
  // loaded.
174
  bool GetComponent(TessdataType type, TFile *fp) const;
175
176
  // Returns the current version string.
177
  std::string VersionString() const;
178
  // Sets the version string to the given v_str.
179
  void SetVersionString(const std::string &v_str);
180
181
  // Returns true if the base Tesseract components are present.
182
2
  bool IsBaseAvailable() const {
183
2
    return !entries_[TESSDATA_UNICHARSET].empty() && !entries_[TESSDATA_INTTEMP].empty();
184
2
  }
185
186
  // Returns true if the LSTM components are present.
187
2
  bool IsLSTMAvailable() const {
188
2
    return !entries_[TESSDATA_LSTM].empty();
189
2
  }
190
191
  // Return the name of the underlying data file.
192
18
  const std::string &GetDataFileName() const {
193
18
    return data_file_name_;
194
18
  }
195
196
  /**
197
   * Reads all the standard tesseract config and data files for a language
198
   * at the given path and bundles them up into one binary data file.
199
   * Returns true if the combined traineddata file was successfully written.
200
   */
201
  bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename);
202
203
  /**
204
   * Gets the individual components from the data file with which the class was
205
   * initialized. Overwrites the components specified by component_filenames.
206
   * Writes the updated traineddata file to new_traineddata_filename.
207
   */
208
  bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames,
209
                           int num_new_components);
210
211
  /**
212
   * Extracts tessdata component implied by the name of the input file from
213
   * the combined traineddata loaded into TessdataManager.
214
   * Writes the extracted component to the file indicated by the file name.
215
   * E.g. if the filename given is somepath/somelang.unicharset, unicharset
216
   * will be extracted from the data loaded into the TessdataManager and will
217
   * be written to somepath/somelang.unicharset.
218
   * @return true if the component was successfully extracted, false if the
219
   * component was not present in the traineddata loaded into TessdataManager.
220
   */
221
  bool ExtractToFile(const char *filename);
222
223
private:
224
  // Use libarchive.
225
  bool LoadArchiveFile(const char *filename);
226
227
  /**
228
   * Fills type with TessdataType of the tessdata component represented by the
229
   * given file suffix. E.g. unicharset -> TESSDATA_UNICHARSET.
230
   * @return true if the tessdata component type could be determined
231
   * from the given suffix.
232
   */
233
  static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type);
234
235
  /**
236
   * Tries to determine the tessdata component type from the file suffix of filename,
237
   * returns true on success.
238
   */
239
  static bool TessdataTypeFromFileName(const char *filename, TessdataType *type);
240
241
  // Name of file it came from.
242
  std::string data_file_name_;
243
  // Function to load the file when we need it.
244
  FileReader reader_;
245
  // True if the file has been loaded.
246
  bool is_loaded_;
247
  // True if the bytes need swapping.
248
  bool swap_;
249
  // Contents of each element of the traineddata file.
250
  std::vector<char> entries_[TESSDATA_NUM_ENTRIES];
251
};
252
253
} // namespace tesseract
254
255
#endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_