/src/tesseract/src/ccutil/tessdatamanager.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | ///////////////////////////////////////////////////////////////////////  | 
2  |  | // File:        tessdatamanager.h  | 
3  |  | // Description: Functions to handle loading/combining tesseract data files.  | 
4  |  | // Author:      Daria Antonova  | 
5  |  | //  | 
6  |  | // (C) Copyright 2009, Google Inc.  | 
7  |  | // Licensed under the Apache License, Version 2.0 (the "License");  | 
8  |  | // you may not use this file except in compliance with the License.  | 
9  |  | // You may obtain a copy of the License at  | 
10  |  | // http://www.apache.org/licenses/LICENSE-2.0  | 
11  |  | // Unless required by applicable law or agreed to in writing, software  | 
12  |  | // distributed under the License is distributed on an "AS IS" BASIS,  | 
13  |  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
14  |  | // See the License for the specific language governing permissions and  | 
15  |  | // limitations under the License.  | 
16  |  | //  | 
17  |  | ///////////////////////////////////////////////////////////////////////  | 
18  |  |  | 
19  |  | #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_  | 
20  |  | #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_  | 
21  |  |  | 
22  |  | #include <tesseract/baseapi.h> // FileReader  | 
23  |  | #include <string>              // std::string  | 
24  |  | #include <vector>              // std::vector  | 
25  |  | #include "serialis.h"          // FileWriter  | 
26  |  |  | 
27  |  | static const char kTrainedDataSuffix[] = "traineddata";  | 
28  |  |  | 
29  |  | // File suffixes of the individual tessdata components. When adding new tessdata types and file suffixes, please make sure to update the TessdataType enum and kTessdataFileSuffixes (kTessdataFileSuffixes[i] is the suffix for type i, so both lists share one order).  | 
30  |  | // The original reminder also names kTessdataFileIsText. NOTE(review): that identifier is not declared in this header; confirm whether it still exists before updating it.  | 
31  |  | static const char kLangConfigFileSuffix[] = "config";  | 
32  |  | static const char kUnicharsetFileSuffix[] = "unicharset";  | 
33  |  | static const char kAmbigsFileSuffix[] = "unicharambigs";  | 
34  |  | static const char kBuiltInTemplatesFileSuffix[] = "inttemp";  | 
35  |  | static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";  | 
36  |  | static const char kNormProtoFileSuffix[] = "normproto";  | 
37  |  | static const char kPuncDawgFileSuffix[] = "punc-dawg";  | 
38  |  | static const char kSystemDawgFileSuffix[] = "word-dawg";  | 
39  |  | static const char kNumberDawgFileSuffix[] = "number-dawg";  | 
40  |  | static const char kFreqDawgFileSuffix[] = "freq-dawg";  | 
41  |  | static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";  | 
42  |  | static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset";  | 
43  |  | static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";  | 
44  |  | static const char kShapeTableFileSuffix[] = "shapetable";  | 
45  |  | static const char kBigramDawgFileSuffix[] = "bigram-dawg";  | 
46  |  | static const char kUnambigDawgFileSuffix[] = "unambig-dawg";  | 
47  |  | static const char kParamsModelFileSuffix[] = "params-model";  | 
48  |  | static const char kLSTMModelFileSuffix[] = "lstm";  | 
49  |  | static const char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg";  | 
50  |  | static const char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg";  | 
51  |  | static const char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg";  | 
52  |  | static const char kLSTMUnicharsetFileSuffix[] = "lstm-unicharset";  | 
53  |  | static const char kLSTMRecoderFileSuffix[] = "lstm-recoder";  | 
54  |  | static const char kVersionFileSuffix[] = "version";  | 
55  |  |  | 
56  |  | namespace tesseract { | 
57  |  |  | 
58  |  | enum TessdataType { | 
59  |  |   TESSDATA_LANG_CONFIG,        // 0  - file suffix "config"  | 
60  |  |   TESSDATA_UNICHARSET,         // 1  - file suffix "unicharset"  | 
61  |  |   TESSDATA_AMBIGS,             // 2  - file suffix "unicharambigs"  | 
62  |  |   TESSDATA_INTTEMP,            // 3  - file suffix "inttemp"  | 
63  |  |   TESSDATA_PFFMTABLE,          // 4  - file suffix "pffmtable"  | 
64  |  |   TESSDATA_NORMPROTO,          // 5  - file suffix "normproto"  | 
65  |  |   TESSDATA_PUNC_DAWG,          // 6  - file suffix "punc-dawg"  | 
66  |  |   TESSDATA_SYSTEM_DAWG,        // 7  - file suffix "word-dawg"  | 
67  |  |   TESSDATA_NUMBER_DAWG,        // 8  - file suffix "number-dawg"  | 
68  |  |   TESSDATA_FREQ_DAWG,          // 9  - file suffix "freq-dawg"  | 
69  |  |   TESSDATA_FIXED_LENGTH_DAWGS, // 10 - file suffix "fixed-length-dawgs"  // deprecated  | 
70  |  |   TESSDATA_CUBE_UNICHARSET,    // 11 - file suffix "cube-unicharset"  // deprecated  | 
71  |  |   TESSDATA_CUBE_SYSTEM_DAWG,   // 12 - file suffix "cube-word-dawg"  // deprecated  | 
72  |  |   TESSDATA_SHAPE_TABLE,        // 13 - file suffix "shapetable"  | 
73  |  |   TESSDATA_BIGRAM_DAWG,        // 14 - file suffix "bigram-dawg"  | 
74  |  |   TESSDATA_UNAMBIG_DAWG,       // 15 - file suffix "unambig-dawg"  | 
75  |  |   TESSDATA_PARAMS_MODEL,       // 16 - file suffix "params-model"  | 
76  |  |   TESSDATA_LSTM,               // 17 - file suffix "lstm"  | 
77  |  |   TESSDATA_LSTM_PUNC_DAWG,     // 18 - file suffix "lstm-punc-dawg"  | 
78  |  |   TESSDATA_LSTM_SYSTEM_DAWG,   // 19 - file suffix "lstm-word-dawg"  | 
79  |  |   TESSDATA_LSTM_NUMBER_DAWG,   // 20 - file suffix "lstm-number-dawg"  | 
80  |  |   TESSDATA_LSTM_UNICHARSET,    // 21 - file suffix "lstm-unicharset"  | 
81  |  |   TESSDATA_LSTM_RECODER,       // 22 - file suffix "lstm-recoder"  | 
82  |  |   TESSDATA_VERSION,            // 23 - file suffix "version"  | 
83  |  |  | 
84  |  |   TESSDATA_NUM_ENTRIES  | 
85  |  | };  | 
86  |  |  | 
87  |  | /**  | 
88  |  |  * kTessdataFileSuffixes[i] is the file suffix for the tessdata component of type i (from TessdataType enum),  | 
89  |  |  * so the 24 entries below (indices 0-23) must stay in exactly the same order as the enum values that precede TESSDATA_NUM_ENTRIES.  | 
90  |  |  */  | 
91  |  | static const char *const kTessdataFileSuffixes[] = { | 
92  |  |     kLangConfigFileSuffix,       // 0  TESSDATA_LANG_CONFIG  | 
93  |  |     kUnicharsetFileSuffix,       // 1  TESSDATA_UNICHARSET  | 
94  |  |     kAmbigsFileSuffix,           // 2  TESSDATA_AMBIGS  | 
95  |  |     kBuiltInTemplatesFileSuffix, // 3  TESSDATA_INTTEMP  | 
96  |  |     kBuiltInCutoffsFileSuffix,   // 4  TESSDATA_PFFMTABLE  | 
97  |  |     kNormProtoFileSuffix,        // 5  TESSDATA_NORMPROTO  | 
98  |  |     kPuncDawgFileSuffix,         // 6  TESSDATA_PUNC_DAWG  | 
99  |  |     kSystemDawgFileSuffix,       // 7  TESSDATA_SYSTEM_DAWG  | 
100  |  |     kNumberDawgFileSuffix,       // 8  TESSDATA_NUMBER_DAWG  | 
101  |  |     kFreqDawgFileSuffix,         // 9  TESSDATA_FREQ_DAWG  | 
102  |  |     kFixedLengthDawgsFileSuffix, // 10 TESSDATA_FIXED_LENGTH_DAWGS  // deprecated  | 
103  |  |     kCubeUnicharsetFileSuffix,   // 11 TESSDATA_CUBE_UNICHARSET  // deprecated  | 
104  |  |     kCubeSystemDawgFileSuffix,   // 12 TESSDATA_CUBE_SYSTEM_DAWG  // deprecated  | 
105  |  |     kShapeTableFileSuffix,       // 13 TESSDATA_SHAPE_TABLE  | 
106  |  |     kBigramDawgFileSuffix,       // 14 TESSDATA_BIGRAM_DAWG  | 
107  |  |     kUnambigDawgFileSuffix,      // 15 TESSDATA_UNAMBIG_DAWG  | 
108  |  |     kParamsModelFileSuffix,      // 16 TESSDATA_PARAMS_MODEL  | 
109  |  |     kLSTMModelFileSuffix,        // 17 TESSDATA_LSTM  | 
110  |  |     kLSTMPuncDawgFileSuffix,     // 18 TESSDATA_LSTM_PUNC_DAWG  | 
111  |  |     kLSTMSystemDawgFileSuffix,   // 19 TESSDATA_LSTM_SYSTEM_DAWG  | 
112  |  |     kLSTMNumberDawgFileSuffix,   // 20 TESSDATA_LSTM_NUMBER_DAWG  | 
113  |  |     kLSTMUnicharsetFileSuffix,   // 21 TESSDATA_LSTM_UNICHARSET  | 
114  |  |     kLSTMRecoderFileSuffix,      // 22 TESSDATA_LSTM_RECODER  | 
115  |  |     kVersionFileSuffix,          // 23 TESSDATA_VERSION  | 
116  |  | };  | 
117  |  |  | 
118  |  | /**  | 
119  |  |  * Upper bound on the number of entries a traineddata file is expected to declare.  | 
120  |  |  * TessdataType could be updated to contain more entries, however we do not expect that number to be astronomically high.  | 
121  |  |  * The bound exists so that endianness can be detected automatically: per the original note, TessdataManager will  | 
122  |  |  * "flip the bits" (presumably byte-swap, cf. swap_ - TODO confirm) if the entry count it reads is larger than kMaxNumTessdataEntries.  | 
123  |  |  * NOTE(review): the original note calls that count actual_tessdata_num_entries_, which is not declared in this header; the check is implemented outside this file.  | 
124  |  |  */  | 
125  |  | static const int kMaxNumTessdataEntries = 1000;  | 
126  |  |  | 
127  |  | class TESS_API TessdataManager { | 
128  |  | public:  | 
129  |  |   TessdataManager();  | 
130  |  |   explicit TessdataManager(FileReader reader);  | 
131  |  |  | 
132  | 2  |   ~TessdataManager() = default;  | 
133  |  |  | 
134  | 0  |   bool swap() const { | 
135  | 0  |     return swap_;  | 
136  | 0  |   }  | 
137  | 2  |   bool is_loaded() const { | 
138  | 2  |     return is_loaded_;  | 
139  | 2  |   }  | 
140  |  |  | 
141  |  |   // Lazily loads from the given filename: the name is remembered, but the file  | 
142  |  |   // itself won't actually be read until its contents are needed.  | 
143  |  |   void LoadFileLater(const char *data_file_name);  | 
144  |  |   /**  | 
145  |  |    * Opens and reads the given data file right now (in contrast to LoadFileLater, which defers the read).  | 
146  |  |    * @return true on success.  | 
147  |  |    */  | 
148  |  |   bool Init(const char *data_file_name);  | 
149  |  |   // Loads from the given memory buffer (data, size) as if it were a file, remembering name as some  | 
150  |  |   // arbitrary source id for caching. NOTE(review): presumably returns true on success - confirm.  | 
151  |  |   bool LoadMemBuffer(const char *name, const char *data, int size);  | 
152  |  |   // Overwrites a single entry of the given type (presumably with size bytes taken from data - confirm).  | 
153  |  |   void OverwriteEntry(TessdataType type, const char *data, int size);  | 
154  |  |  | 
155  |  |   // Saves to the given filename, using the supplied writer. NOTE(review): presumably returns true on success - confirm.  | 
156  |  |   bool SaveFile(const char *filename, FileWriter writer) const;  | 
157  |  |   // Serializes to the given vector; data is an output parameter.  | 
158  |  |   void Serialize(std::vector<char> *data) const;  | 
159  |  |   // Resets to the initial state, keeping the reader (reader_).  | 
160  |  |   void Clear();  | 
161  |  |  | 
162  |  |   // Prints a directory of contents.  | 
163  |  |   void Directory() const;  | 
164  |  |  | 
165  |  |   // Returns true if the component requested is present, i.e. its slot in entries_ is non-empty. type indexes entries_ directly and is not range-checked here.  | 
166  | 6  |   bool IsComponentAvailable(TessdataType type) const { | 
167  | 6  |     return !entries_[type].empty();  | 
168  | 6  |   }  | 
169  |  |   // Opens the given TFile pointer to the given component type.  | 
170  |  |   // Returns false in case of failure.  | 
171  |  |   bool GetComponent(TessdataType type, TFile *fp);  | 
172  |  |   // As non-const version except it can't load the component if not already  | 
173  |  |   // loaded (being const, it cannot modify this object).  | 
174  |  |   bool GetComponent(TessdataType type, TFile *fp) const;  | 
175  |  |  | 
176  |  |   // Returns the current version string (returned by value).  | 
177  |  |   std::string VersionString() const;  | 
178  |  |   // Sets the version string to the given v_str.  | 
179  |  |   void SetVersionString(const std::string &v_str);  | 
180  |  |  | 
181  |  |   // Returns true if the base Tesseract components are present: both the TESSDATA_UNICHARSET and TESSDATA_INTTEMP entries are non-empty.  | 
182  | 2  |   bool IsBaseAvailable() const { | 
183  | 2  |     return !entries_[TESSDATA_UNICHARSET].empty() && !entries_[TESSDATA_INTTEMP].empty();  | 
184  | 2  |   }  | 
185  |  |  | 
186  |  |   // Returns true if the LSTM components are present: the TESSDATA_LSTM entry is non-empty.  | 
187  | 2  |   bool IsLSTMAvailable() const { | 
188  | 2  |     return !entries_[TESSDATA_LSTM].empty();  | 
189  | 2  |   }  | 
190  |  |  | 
191  |  |   // Returns a const reference to data_file_name_, the name of the underlying data file.  | 
192  | 18  |   const std::string &GetDataFileName() const { | 
193  | 18  |     return data_file_name_;  | 
194  | 18  |   }  | 
195  |  |  | 
196  |  |   /**  | 
197  |  |    * Reads all the standard tesseract config and data files for a language  | 
198  |  |    * at the given path and bundles them up into one binary data file.  | 
199  |  |    * Returns true if the combined traineddata file was successfully written.  | 
200  |  |    */  | 
201  |  |   bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename);  | 
202  |  |  | 
203  |  |   /**  | 
204  |  |    * Gets the individual components from the data file with which the class was  | 
205  |  |    * initialized. Overwrites the components specified by component_filenames (an array of num_new_components file names).  | 
206  |  |    * Writes the updated traineddata file to new_traineddata_filename.  | 
207  |  |    */  | 
208  |  |   bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames,  | 
209  |  |                            int num_new_components);  | 
210  |  |  | 
211  |  |   /**  | 
212  |  |    * Extracts tessdata component implied by the name of the input file from  | 
213  |  |    * the combined traineddata loaded into TessdataManager.  | 
214  |  |    * Writes the extracted component to the file indicated by the file name.  | 
215  |  |    * E.g. if the filename given is somepath/somelang.unicharset, unicharset  | 
216  |  |    * will be extracted from the data loaded into the TessdataManager and will  | 
217  |  |    * be written to somepath/somelang.unicharset.  | 
218  |  |    * @return true if the component was successfully extracted, false if the  | 
219  |  |    * component was not present in the traineddata loaded into TessdataManager.  | 
220  |  |    */  | 
221  |  |   bool ExtractToFile(const char *filename);  | 
222  |  |  | 
223  |  | private:  | 
224  |  |   // Loads the given file using libarchive. NOTE(review): presumably returns true on success - confirm.  | 
225  |  |   bool LoadArchiveFile(const char *filename);  | 
226  |  |  | 
227  |  |   /**  | 
228  |  |    * Fills type with TessdataType of the tessdata component identified by the  | 
229  |  |    * given file suffix. E.g. unicharset -> TESSDATA_UNICHARSET (see kTessdataFileSuffixes).  | 
230  |  |    * @return true if the tessdata component type could be determined  | 
231  |  |    * from the given suffix.  | 
232  |  |    */  | 
233  |  |   static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type);  | 
234  |  |  | 
235  |  |   /**  | 
236  |  |    * Tries to determine the tessdata component type from the file suffix of filename and fills type with it,  | 
237  |  |    * e.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET; returns true on success.  | 
238  |  |    */  | 
239  |  |   static bool TessdataTypeFromFileName(const char *filename, TessdataType *type);  | 
240  |  |  | 
241  |  |   // Name of file it came from (exposed through GetDataFileName()).  | 
242  |  |   std::string data_file_name_;  | 
243  |  |   // Function to load the file when we need it.  | 
244  |  |   FileReader reader_;  | 
245  |  |   // True if the file has been loaded (exposed through is_loaded()).  | 
246  |  |   bool is_loaded_;  | 
247  |  |   // True if the bytes need swapping (exposed through swap()).  | 
248  |  |   bool swap_;  | 
249  |  |   // Contents of each element of the traineddata file, indexed by TessdataType; an empty vector is treated as "component not present".  | 
250  |  |   std::vector<char> entries_[TESSDATA_NUM_ENTRIES];  | 
251  |  | };  | 
252  |  |  | 
253  |  | } // namespace tesseract  | 
254  |  |  | 
255  |  | #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_  |