/src/tesseract/src/ccutil/tessdatamanager.h
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: tessdatamanager.h |
3 | | // Description: Functions to handle loading/combining tesseract data files. |
4 | | // Author: Daria Antonova |
5 | | // |
6 | | // (C) Copyright 2009, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | // |
17 | | /////////////////////////////////////////////////////////////////////// |
18 | | |
19 | | #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_ |
20 | | #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_ |
21 | | |
22 | | #include <tesseract/baseapi.h> // FileReader |
23 | | #include <string> // std::string |
24 | | #include <vector> // std::vector |
25 | | #include "serialis.h" // FileWriter |
26 | | |
// Suffix of the combined traineddata file.
static constexpr char kTrainedDataSuffix[] = "traineddata";

// File suffixes of the individual tessdata components.
// When adding a new tessdata type and file suffix, keep the TessdataType
// enum and the kTessdataFileSuffixes table (both declared further down, in
// namespace tesseract) in step with this list: the table is indexed by the
// enum, so all three must list the components in the same order.
// NOTE(review): this note used to also name kTessdataFileIsText, which is
// not defined anywhere in this header.
static constexpr char kLangConfigFileSuffix[] = "config";
static constexpr char kUnicharsetFileSuffix[] = "unicharset";
static constexpr char kAmbigsFileSuffix[] = "unicharambigs";
static constexpr char kBuiltInTemplatesFileSuffix[] = "inttemp";
static constexpr char kBuiltInCutoffsFileSuffix[] = "pffmtable";
static constexpr char kNormProtoFileSuffix[] = "normproto";
static constexpr char kPuncDawgFileSuffix[] = "punc-dawg";
static constexpr char kSystemDawgFileSuffix[] = "word-dawg";
static constexpr char kNumberDawgFileSuffix[] = "number-dawg";
static constexpr char kFreqDawgFileSuffix[] = "freq-dawg";
static constexpr char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";  // deprecated
static constexpr char kCubeUnicharsetFileSuffix[] = "cube-unicharset";       // deprecated
static constexpr char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";        // deprecated
static constexpr char kShapeTableFileSuffix[] = "shapetable";
static constexpr char kBigramDawgFileSuffix[] = "bigram-dawg";
static constexpr char kUnambigDawgFileSuffix[] = "unambig-dawg";
static constexpr char kParamsModelFileSuffix[] = "params-model";
static constexpr char kLSTMModelFileSuffix[] = "lstm";
static constexpr char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg";
static constexpr char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg";
static constexpr char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg";
static constexpr char kLSTMUnicharsetFileSuffix[] = "lstm-unicharset";
static constexpr char kLSTMRecoderFileSuffix[] = "lstm-recoder";
static constexpr char kVersionFileSuffix[] = "version";
55 | | |
56 | | namespace tesseract { |
57 | | |
/**
 * Identifies one component of a traineddata file.
 * The numeric value of each enumerator is used as an array index (into
 * kTessdataFileSuffixes and into TessdataManager's per-component storage),
 * so the values are spelled out and must not be renumbered.
 */
enum TessdataType {
  TESSDATA_LANG_CONFIG = 0,
  TESSDATA_UNICHARSET = 1,
  TESSDATA_AMBIGS = 2,
  TESSDATA_INTTEMP = 3,
  TESSDATA_PFFMTABLE = 4,
  TESSDATA_NORMPROTO = 5,
  TESSDATA_PUNC_DAWG = 6,
  TESSDATA_SYSTEM_DAWG = 7,
  TESSDATA_NUMBER_DAWG = 8,
  TESSDATA_FREQ_DAWG = 9,
  TESSDATA_FIXED_LENGTH_DAWGS = 10, // deprecated
  TESSDATA_CUBE_UNICHARSET = 11,    // deprecated
  TESSDATA_CUBE_SYSTEM_DAWG = 12,   // deprecated
  TESSDATA_SHAPE_TABLE = 13,
  TESSDATA_BIGRAM_DAWG = 14,
  TESSDATA_UNAMBIG_DAWG = 15,
  TESSDATA_PARAMS_MODEL = 16,
  TESSDATA_LSTM = 17,
  TESSDATA_LSTM_PUNC_DAWG = 18,
  TESSDATA_LSTM_SYSTEM_DAWG = 19,
  TESSDATA_LSTM_NUMBER_DAWG = 20,
  TESSDATA_LSTM_UNICHARSET = 21,
  TESSDATA_LSTM_RECODER = 22,
  TESSDATA_VERSION = 23,

  // Count of the component types above; keep it as the last enumerator so
  // that it follows the highest value automatically.
  TESSDATA_NUM_ENTRIES
};
86 | | |
87 | | /** |
88 | | * kTessdataFileSuffixes[i] indicates the file suffix for |
89 | | * tessdata of type i (from TessdataType enum). |
90 | | */ |
91 | | static const char *const kTessdataFileSuffixes[] = { |
92 | | kLangConfigFileSuffix, // 0 |
93 | | kUnicharsetFileSuffix, // 1 |
94 | | kAmbigsFileSuffix, // 2 |
95 | | kBuiltInTemplatesFileSuffix, // 3 |
96 | | kBuiltInCutoffsFileSuffix, // 4 |
97 | | kNormProtoFileSuffix, // 5 |
98 | | kPuncDawgFileSuffix, // 6 |
99 | | kSystemDawgFileSuffix, // 7 |
100 | | kNumberDawgFileSuffix, // 8 |
101 | | kFreqDawgFileSuffix, // 9 |
102 | | kFixedLengthDawgsFileSuffix, // 10 // deprecated |
103 | | kCubeUnicharsetFileSuffix, // 11 // deprecated |
104 | | kCubeSystemDawgFileSuffix, // 12 // deprecated |
105 | | kShapeTableFileSuffix, // 13 |
106 | | kBigramDawgFileSuffix, // 14 |
107 | | kUnambigDawgFileSuffix, // 15 |
108 | | kParamsModelFileSuffix, // 16 |
109 | | kLSTMModelFileSuffix, // 17 |
110 | | kLSTMPuncDawgFileSuffix, // 18 |
111 | | kLSTMSystemDawgFileSuffix, // 19 |
112 | | kLSTMNumberDawgFileSuffix, // 20 |
113 | | kLSTMUnicharsetFileSuffix, // 21 |
114 | | kLSTMRecoderFileSuffix, // 22 |
115 | | kVersionFileSuffix, // 23 |
116 | | }; |
117 | | |
/**
 * Upper bound on the number of entries a traineddata file is expected to
 * declare. TessdataType may gain more entries over time, but their number
 * is never expected to get anywhere near this limit.
 * The limit serves automatic endianness detection: an entry count read
 * from a file that is larger than kMaxNumTessdataEntries is taken to mean
 * that the data was written with the opposite byte order and has to be
 * swapped. (The loading code that applies this check is not part of this
 * header.)
 */
static constexpr int kMaxNumTessdataEntries = 1000;
126 | | |
127 | | class TESS_API TessdataManager { |
128 | | public: |
129 | | TessdataManager(); |
130 | | explicit TessdataManager(FileReader reader); |
131 | | |
132 | 2 | ~TessdataManager() = default; |
133 | | |
134 | 0 | bool swap() const { |
135 | 0 | return swap_; |
136 | 0 | } |
137 | 2 | bool is_loaded() const { |
138 | 2 | return is_loaded_; |
139 | 2 | } |
140 | | |
141 | | // Lazily loads from the given filename. Won't actually read the file |
142 | | // until it needs it. |
143 | | void LoadFileLater(const char *data_file_name); |
144 | | /** |
145 | | * Opens and reads the given data file right now. |
146 | | * @return true on success. |
147 | | */ |
148 | | bool Init(const char *data_file_name); |
149 | | // Loads from the given memory buffer as if a file, remembering name as some |
150 | | // arbitrary source id for caching. |
151 | | bool LoadMemBuffer(const char *name, const char *data, int size); |
152 | | // Overwrites a single entry of the given type. |
153 | | void OverwriteEntry(TessdataType type, const char *data, int size); |
154 | | |
155 | | // Saves to the given filename. |
156 | | bool SaveFile(const char *filename, FileWriter writer) const; |
157 | | // Serializes to the given vector. |
158 | | void Serialize(std::vector<char> *data) const; |
159 | | // Resets to the initial state, keeping the reader. |
160 | | void Clear(); |
161 | | |
162 | | // Prints a directory of contents. |
163 | | void Directory() const; |
164 | | |
165 | | // Returns true if the component requested is present. |
166 | 6 | bool IsComponentAvailable(TessdataType type) const { |
167 | 6 | return !entries_[type].empty(); |
168 | 6 | } |
169 | | // Opens the given TFile pointer to the given component type. |
170 | | // Returns false in case of failure. |
171 | | bool GetComponent(TessdataType type, TFile *fp); |
172 | | // As non-const version except it can't load the component if not already |
173 | | // loaded. |
174 | | bool GetComponent(TessdataType type, TFile *fp) const; |
175 | | |
176 | | // Returns the current version string. |
177 | | std::string VersionString() const; |
178 | | // Sets the version string to the given v_str. |
179 | | void SetVersionString(const std::string &v_str); |
180 | | |
181 | | // Returns true if the base Tesseract components are present. |
182 | 2 | bool IsBaseAvailable() const { |
183 | 2 | return !entries_[TESSDATA_UNICHARSET].empty() && !entries_[TESSDATA_INTTEMP].empty(); |
184 | 2 | } |
185 | | |
186 | | // Returns true if the LSTM components are present. |
187 | 2 | bool IsLSTMAvailable() const { |
188 | 2 | return !entries_[TESSDATA_LSTM].empty(); |
189 | 2 | } |
190 | | |
191 | | // Return the name of the underlying data file. |
192 | 18 | const std::string &GetDataFileName() const { |
193 | 18 | return data_file_name_; |
194 | 18 | } |
195 | | |
196 | | /** |
197 | | * Reads all the standard tesseract config and data files for a language |
198 | | * at the given path and bundles them up into one binary data file. |
199 | | * Returns true if the combined traineddata file was successfully written. |
200 | | */ |
201 | | bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename); |
202 | | |
203 | | /** |
204 | | * Gets the individual components from the data_file_ with which the class was |
205 | | * initialized. Overwrites the components specified by component_filenames. |
206 | | * Writes the updated traineddata file to new_traineddata_filename. |
207 | | */ |
208 | | bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, |
209 | | int num_new_components); |
210 | | |
211 | | /** |
212 | | * Extracts tessdata component implied by the name of the input file from |
213 | | * the combined traineddata loaded into TessdataManager. |
214 | | * Writes the extracted component to the file indicated by the file name. |
215 | | * E.g. if the filename given is somepath/somelang.unicharset, unicharset |
216 | | * will be extracted from the data loaded into the TessdataManager and will |
217 | | * be written to somepath/somelang.unicharset. |
218 | | * @return true if the component was successfully extracted, false if the |
219 | | * component was not present in the traineddata loaded into TessdataManager. |
220 | | */ |
221 | | bool ExtractToFile(const char *filename); |
222 | | |
223 | | private: |
224 | | // Use libarchive. |
225 | | bool LoadArchiveFile(const char *filename); |
226 | | |
227 | | /** |
228 | | * Fills type with TessdataType of the tessdata component represented by the |
229 | | * given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET. |
230 | | * @return true if the tessdata component type could be determined |
231 | | * from the given file name. |
232 | | */ |
233 | | static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type); |
234 | | |
235 | | /** |
236 | | * Tries to determine tessdata component file suffix from filename, |
237 | | * returns true on success. |
238 | | */ |
239 | | static bool TessdataTypeFromFileName(const char *filename, TessdataType *type); |
240 | | |
241 | | // Name of file it came from. |
242 | | std::string data_file_name_; |
243 | | // Function to load the file when we need it. |
244 | | FileReader reader_; |
245 | | // True if the file has been loaded. |
246 | | bool is_loaded_; |
247 | | // True if the bytes need swapping. |
248 | | bool swap_; |
249 | | // Contents of each element of the traineddata file. |
250 | | std::vector<char> entries_[TESSDATA_NUM_ENTRIES]; |
251 | | }; |
252 | | |
253 | | } // namespace tesseract |
254 | | |
255 | | #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_ |