/src/tesseract/src/ccstruct/imagedata.h
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: imagedata.h |
3 | | // Description: Class to hold information about a single image and its |
4 | | // corresponding boxes or text file. |
5 | | // Author: Ray Smith |
6 | | // |
7 | | // (C) Copyright 2013, Google Inc. |
8 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
9 | | // you may not use this file except in compliance with the License. |
10 | | // You may obtain a copy of the License at |
11 | | // http://www.apache.org/licenses/LICENSE-2.0 |
12 | | // Unless required by applicable law or agreed to in writing, software |
13 | | // distributed under the License is distributed on an "AS IS" BASIS, |
14 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | | // See the License for the specific language governing permissions and |
16 | | // limitations under the License. |
17 | | /////////////////////////////////////////////////////////////////////// |
18 | | |
19 | | #ifndef TESSERACT_IMAGE_IMAGEDATA_H_ |
20 | | #define TESSERACT_IMAGE_IMAGEDATA_H_ |
21 | | |
22 | | #include "image.h" |
23 | | #include "points.h" // for FCOORD |
24 | | |
25 | | #include <mutex> // for std::mutex |
26 | | #include <thread> // for std::thread |
27 | | |
28 | | struct Pix; |
29 | | |
30 | | namespace tesseract { |
31 | | |
32 | | class TFile; |
33 | | class ScrollView; |
34 | | class TBOX; |
35 | | |
// Padding, measured in output pixels, that is applied in feature mode.
constexpr int kFeaturePadding = 2;
// Padding, measured in pixels, that is added around text boxes.
constexpr int kImagePadding = 4;
40 | | |
// Selects how samples are cached and in which order they are served.
enum CachingStrategy {
  // Consumes one file completely before advancing to the next, so the samples
  // must already be shuffled across files. To get high-speed random access,
  // the sample count of the first file is assumed for every file. As a
  // result, entries of smaller subsequent files are used more than once and
  // some entries of larger subsequent files are never used.
  // Best suited to larger data sets that do not fit in memory.
  CS_SEQUENTIAL = 0,
  // Takes one sample from each file in rotation. Shuffled samples are not
  // required, but the strategy is extremely disk-intensive, and samples in
  // smaller files are used more often than samples in larger files.
  // Best suited to smaller data sets that mostly fit in memory.
  CS_ROUND_ROBIN = 1,
};
56 | | |
57 | | // Class to hold information on a single image: |
58 | | // Filename, cached image as a Pix*, character boxes, text transcription. |
59 | | // The text transcription is the ground truth UTF-8 text for the image. |
60 | | // Character boxes are optional and indicate the desired segmentation of |
61 | | // the text into recognition units. |
62 | | class TESS_API ImageData { |
63 | | public: |
64 | | ImageData(); |
65 | | // Takes ownership of the pix. |
66 | | ImageData(bool vertical, Image pix); |
67 | | ~ImageData(); |
68 | | |
69 | | // Builds and returns an ImageData from the basic data. Note that imagedata, |
70 | | // truth_text, and box_text are all the actual file data, NOT filenames. |
71 | | static ImageData *Build(const char *name, int page_number, const char *lang, |
72 | | const char *imagedata, int imagedatasize, const char *truth_text, |
73 | | const char *box_text); |
74 | | |
75 | | // Writes to the given file. Returns false in case of error. |
76 | | bool Serialize(TFile *fp) const; |
77 | | // Reads from the given file. Returns false in case of error. |
78 | | bool DeSerialize(TFile *fp); |
79 | | // As DeSerialize, but only seeks past the data - hence a static method. |
80 | | static bool SkipDeSerialize(TFile *fp); |
81 | | |
82 | | // Other accessors. |
83 | 0 | const std::string &imagefilename() const { |
84 | 0 | return imagefilename_; |
85 | 0 | } |
86 | 0 | void set_imagefilename(const std::string &name) { |
87 | 0 | imagefilename_ = name; |
88 | 0 | } |
89 | 0 | int page_number() const { |
90 | 0 | return page_number_; |
91 | 0 | } |
92 | 0 | void set_page_number(int num) { |
93 | 0 | page_number_ = num; |
94 | 0 | } |
95 | 0 | const std::vector<char> &image_data() const { |
96 | 0 | return image_data_; |
97 | 0 | } |
98 | 0 | const std::string &language() const { |
99 | 0 | return language_; |
100 | 0 | } |
101 | 0 | void set_language(const std::string &lang) { |
102 | 0 | language_ = lang; |
103 | 0 | } |
104 | 0 | const std::string &transcription() const { |
105 | 0 | return transcription_; |
106 | 0 | } |
107 | 0 | const std::vector<TBOX> &boxes() const { |
108 | 0 | return boxes_; |
109 | 0 | } |
110 | 0 | const std::vector<std::string> &box_texts() const { |
111 | 0 | return box_texts_; |
112 | 0 | } |
113 | 0 | const std::string &box_text(int index) const { |
114 | 0 | return box_texts_[index]; |
115 | 0 | } |
116 | | // Saves the given Pix as a PNG-encoded string and destroys it. |
117 | | // In case of missing PNG support in Leptonica use PNM format, |
118 | | // which requires more memory. |
119 | | void SetPix(Image pix); |
120 | | // Returns the Pix image for *this. Must be pixDestroyed after use. |
121 | | Image GetPix() const; |
122 | | // Gets anything and everything with a non-nullptr pointer, prescaled to a |
123 | | // given target_height (if 0, then the original image height), and aligned. |
124 | | // Also returns (if not nullptr) the width and height of the scaled image. |
125 | | // The return value is the scaled Pix, which must be pixDestroyed after use, |
126 | | // and scale_factor (if not nullptr) is set to the scale factor that was |
127 | | // applied to the image to achieve the target_height. |
128 | | Image PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width, |
129 | | int *scaled_height, std::vector<TBOX> *boxes) const; |
130 | | |
131 | | int MemoryUsed() const; |
132 | | |
133 | | // Draws the data in a new window. |
134 | | void Display() const; |
135 | | |
136 | | // Adds the supplied boxes and transcriptions that correspond to the correct |
137 | | // page number. |
138 | | void AddBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts, |
139 | | const std::vector<int> &box_pages); |
140 | | |
141 | | private: |
142 | | // Saves the given Pix as a PNG-encoded string and destroys it. |
143 | | // In case of missing PNG support in Leptonica use PNM format, |
144 | | // which requires more memory. |
145 | | static void SetPixInternal(Image pix, std::vector<char> *image_data); |
146 | | // Returns the Pix image for the image_data. Must be pixDestroyed after use. |
147 | | static Image GetPixInternal(const std::vector<char> &image_data); |
148 | | // Parses the text string as a box file and adds any discovered boxes that |
149 | | // match the page number. Returns false on error. |
150 | | bool AddBoxes(const char *box_text); |
151 | | |
152 | | private: |
153 | | std::string imagefilename_; // File to read image from. |
154 | | int32_t page_number_; // Page number if multi-page tif or -1. |
155 | | // see https://github.com/tesseract-ocr/tesseract/pull/2965 |
156 | | // EP: reconsider for tess6.0/opencv |
157 | | #ifdef TESSERACT_IMAGEDATA_AS_PIX |
158 | | Image internal_pix_; |
159 | | #endif |
160 | | std::vector<char> image_data_; // PNG/PNM file data. |
161 | | std::string language_; // Language code for image. |
162 | | std::string transcription_; // UTF-8 ground truth of image. |
163 | | std::vector<TBOX> boxes_; // If non-empty boxes of the image. |
164 | | std::vector<std::string> box_texts_; // String for text in each box. |
165 | | bool vertical_text_; // Image has been rotated from vertical. |
166 | | }; |
167 | | |
168 | | // A collection of ImageData that knows roughly how much memory it is using. |
169 | | class DocumentData { |
170 | | public: |
171 | | TESS_API |
172 | | explicit DocumentData(const std::string &name); |
173 | | TESS_API |
174 | | ~DocumentData(); |
175 | | |
176 | | // Reads all the pages in the given lstmf filename to the cache. The reader |
177 | | // is used to read the file. |
178 | | TESS_API |
179 | | bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader); |
180 | | // Sets up the document, without actually loading it. |
181 | | void SetDocument(const char *filename, int64_t max_memory, FileReader reader); |
182 | | // Writes all the pages to the given filename. Returns false on error. |
183 | | TESS_API |
184 | | bool SaveDocument(const char *filename, FileWriter writer); |
185 | | |
186 | | // Adds the given page data to this document, counting up memory. |
187 | | TESS_API |
188 | | void AddPageToDocument(ImageData *page); |
189 | | |
190 | 0 | const std::string &document_name() const { |
191 | 0 | std::lock_guard<std::mutex> lock(general_mutex_); |
192 | 0 | return document_name_; |
193 | 0 | } |
194 | 0 | int NumPages() const { |
195 | 0 | std::lock_guard<std::mutex> lock(general_mutex_); |
196 | 0 | return total_pages_; |
197 | 0 | } |
198 | 0 | size_t PagesSize() const { |
199 | 0 | return pages_.size(); |
200 | 0 | } |
201 | 0 | int64_t memory_used() const { |
202 | 0 | std::lock_guard<std::mutex> lock(general_mutex_); |
203 | 0 | return memory_used_; |
204 | 0 | } |
205 | | // If the given index is not currently loaded, loads it using a separate |
206 | | // thread. Note: there are 4 cases: |
207 | | // Document uncached: IsCached() returns false, total_pages_ < 0. |
208 | | // Required page is available: IsPageAvailable returns true. In this case, |
209 | | // total_pages_ > 0 and |
210 | | // pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size() |
211 | | // Pages are loaded, but the required one is not. |
212 | | // The requested page is being loaded by LoadPageInBackground. In this case, |
213 | | // index == pages_offset_. Once the loading starts, the pages lock is held |
214 | | // until it completes, at which point IsPageAvailable will unblock and return |
215 | | // true. |
216 | | void LoadPageInBackground(int index); |
217 | | // Returns a pointer to the page with the given index, modulo the total |
218 | | // number of pages. Blocks until the background load is completed. |
219 | | TESS_API |
220 | | const ImageData *GetPage(int index); |
221 | | // Returns true if the requested page is available, and provides a pointer, |
222 | | // which may be nullptr if the document is empty. May block, even though it |
223 | | // doesn't guarantee to return true. |
224 | | bool IsPageAvailable(int index, ImageData **page); |
225 | | // Takes ownership of the given page index. The page is made nullptr in *this. |
226 | 0 | ImageData *TakePage(int index) { |
227 | 0 | std::lock_guard<std::mutex> lock(pages_mutex_); |
228 | 0 | ImageData *page = pages_[index]; |
229 | 0 | pages_[index] = nullptr; |
230 | 0 | return page; |
231 | 0 | } |
232 | | // Returns true if the document is currently loaded or in the process of |
233 | | // loading. |
234 | 0 | bool IsCached() const { |
235 | 0 | return NumPages() >= 0; |
236 | 0 | } |
237 | | // Removes all pages from memory and frees the memory, but does not forget |
238 | | // the document metadata. Returns the memory saved. |
239 | | int64_t UnCache(); |
240 | | // Shuffles all the pages in the document. |
241 | | void Shuffle(); |
242 | | |
243 | | private: |
244 | | // Sets the value of total_pages_ behind a mutex. |
245 | 0 | void set_total_pages(int total) { |
246 | 0 | std::lock_guard<std::mutex> lock(general_mutex_); |
247 | 0 | total_pages_ = total; |
248 | 0 | } |
249 | 0 | void set_memory_used(int64_t memory_used) { |
250 | 0 | std::lock_guard<std::mutex> lock(general_mutex_); |
251 | 0 | memory_used_ = memory_used; |
252 | 0 | } |
253 | | // Locks the pages_mutex_ and loads as many pages as will fit into max_memory_ |
254 | | // starting at index pages_offset_. |
255 | | bool ReCachePages(); |
256 | | |
257 | | private: |
258 | | // A name for this document. |
259 | | std::string document_name_; |
260 | | // A group of pages that corresponds in some loose way to a document. |
261 | | std::vector<ImageData *> pages_; |
262 | | // Page number of the first index in pages_. |
263 | | int pages_offset_; |
264 | | // Total number of pages in document (may exceed size of pages_.) |
265 | | int total_pages_; |
266 | | // Total of all pix sizes in the document. |
267 | | int64_t memory_used_; |
268 | | // Max memory to use at any time. |
269 | | int64_t max_memory_; |
270 | | // Saved reader from LoadDocument to allow re-caching. |
271 | | FileReader reader_; |
272 | | // Mutex that protects pages_ and pages_offset_ against multiple parallel |
273 | | // loads, and provides a wait for page. |
274 | | std::mutex pages_mutex_; |
275 | | // Mutex that protects other data members that callers want to access without |
276 | | // waiting for a load operation. |
277 | | mutable std::mutex general_mutex_; |
278 | | |
279 | | // Thread which loads document. |
280 | | std::thread thread; |
281 | | }; |
282 | | |
283 | | // A collection of DocumentData that knows roughly how much memory it is using. |
284 | | // Note that while it supports background read-ahead, it assumes that a single |
285 | | // thread is accessing documents, ie it is not safe for multiple threads to |
286 | | // access different documents in parallel, as one may de-cache the other's |
287 | | // content. |
288 | | class DocumentCache { |
289 | | public: |
290 | | TESS_API |
291 | | explicit DocumentCache(int64_t max_memory); |
292 | | TESS_API |
293 | | ~DocumentCache(); |
294 | | |
295 | | // Deletes all existing documents from the cache. |
296 | 0 | void Clear() { |
297 | 0 | for (auto *document : documents_) { |
298 | 0 | delete document; |
299 | 0 | } |
300 | 0 | documents_.clear(); |
301 | 0 | num_pages_per_doc_ = 0; |
302 | 0 | } |
303 | | // Adds all the documents in the list of filenames, counting memory. |
304 | | // The reader is used to read the files. |
305 | | TESS_API |
306 | | bool LoadDocuments(const std::vector<std::string> &filenames, CachingStrategy cache_strategy, |
307 | | FileReader reader); |
308 | | |
309 | | // Adds document to the cache. |
310 | | bool AddToCache(DocumentData *data); |
311 | | |
312 | | // Finds and returns a document by name. |
313 | | DocumentData *FindDocument(const std::string &document_name) const; |
314 | | |
315 | | // Returns a page by serial number using the current cache_strategy_ to |
316 | | // determine the mapping from serial number to page. |
317 | 0 | const ImageData *GetPageBySerial(int serial) { |
318 | 0 | if (cache_strategy_ == CS_SEQUENTIAL) { |
319 | 0 | return GetPageSequential(serial); |
320 | 0 | } else { |
321 | 0 | return GetPageRoundRobin(serial); |
322 | 0 | } |
323 | 0 | } |
324 | | |
325 | 0 | const std::vector<DocumentData *> &documents() const { |
326 | 0 | return documents_; |
327 | 0 | } |
328 | | // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache |
329 | | // strategy, could take a long time. |
330 | | TESS_API |
331 | | int TotalPages(); |
332 | | |
333 | | private: |
334 | | // Returns a page by serial number, selecting them in a round-robin fashion |
335 | | // from all the documents. Highly disk-intensive, but doesn't need samples |
336 | | // to be shuffled between files to begin with. |
337 | | TESS_API |
338 | | const ImageData *GetPageRoundRobin(int serial); |
339 | | // Returns a page by serial number, selecting them in sequence from each file. |
340 | | // Requires the samples to be shuffled between the files to give a random or |
341 | | // uniform distribution of data. Less disk-intensive than GetPageRoundRobin. |
342 | | TESS_API |
343 | | const ImageData *GetPageSequential(int serial); |
344 | | |
345 | | // Helper counts the number of adjacent cached neighbour documents_ of index |
346 | | // looking in direction dir, ie index+dir, index+2*dir etc. |
347 | | int CountNeighbourDocs(int index, int dir); |
348 | | |
349 | | // A group of pages that corresponds in some loose way to a document. |
350 | | std::vector<DocumentData *> documents_; |
351 | | // Strategy to use for caching and serializing data samples. |
352 | | CachingStrategy cache_strategy_ = CS_SEQUENTIAL; |
353 | | // Number of pages in the first document, used as a divisor in |
354 | | // GetPageSequential to determine the document index. |
355 | | int num_pages_per_doc_ = 0; |
356 | | // Max memory allowed in this cache. |
357 | | int64_t max_memory_ = 0; |
358 | | }; |
359 | | |
360 | | } // namespace tesseract |
361 | | |
362 | | #endif // TESSERACT_IMAGE_IMAGEDATA_H_ |