Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/ccstruct/imagedata.h
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////
2
// File:        imagedata.h
3
// Description: Class to hold information about a single image and its
4
//              corresponding boxes or text file.
5
// Author:      Ray Smith
6
//
7
// (C) Copyright 2013, Google Inc.
8
// Licensed under the Apache License, Version 2.0 (the "License");
9
// you may not use this file except in compliance with the License.
10
// You may obtain a copy of the License at
11
// http://www.apache.org/licenses/LICENSE-2.0
12
// Unless required by applicable law or agreed to in writing, software
13
// distributed under the License is distributed on an "AS IS" BASIS,
14
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
// See the License for the specific language governing permissions and
16
// limitations under the License.
17
///////////////////////////////////////////////////////////////////////
18
19
#ifndef TESSERACT_IMAGE_IMAGEDATA_H_
20
#define TESSERACT_IMAGE_IMAGEDATA_H_
21
22
#include "image.h"
23
#include "points.h" // for FCOORD
24
25
#include <mutex>  // for std::mutex
26
#include <thread> // for std::thread
27
28
struct Pix;
29
30
namespace tesseract {
31
32
class TFile;
33
class ScrollView;
34
class TBOX;
35
36
// Amount of padding to apply in output pixels in feature mode.
// Declared as a namespace-scope `const int`, so it has internal linkage and
// each translation unit that includes this header gets its own copy.
37
const int kFeaturePadding = 2;
38
// Number of pixels to pad around text boxes.
// Like kFeaturePadding, an internal-linkage constant private to each
// including translation unit.
39
const int kImagePadding = 4;
40
41
// Enum to determine the caching and data sequencing strategy.
// This is an unscoped enum, so the enumerators are visible directly in the
// enclosing namespace. No explicit values are given, so CS_SEQUENTIAL is 0
// and CS_ROUND_ROBIN is 1.
42
enum CachingStrategy {
43
  // Reads all of one file before moving on to the next. Requires samples to be
44
  // shuffled across files. Uses the count of samples in the first file as
45
  // the count in all the files to achieve high-speed random access. As a
46
  // consequence, if subsequent files are smaller, they get entries used more
47
  // than once, and if subsequent files are larger, some entries are not used.
48
  // Best for larger data sets that don't fit in memory.
49
  CS_SEQUENTIAL,
50
  // Reads one sample from each file in rotation. Does not require shuffled
51
  // samples, but is extremely disk-intensive. Samples in smaller files also
52
  // get used more often than samples in larger files.
53
  // Best for smaller data sets that mostly fit in memory.
54
  CS_ROUND_ROBIN,
55
};
56
57
// Class to hold information on a single image:
58
// Filename, cached image as a Pix*, character boxes, text transcription.
59
// The text transcription is the ground truth UTF-8 text for the image.
60
// Character boxes are optional and indicate the desired segmentation of
61
// the text into recognition units.
// Only the simple accessors are defined inline in this header; the
// constructors, destructor and every other member function are defined out
// of line, so the notes on them below describe the declarations only.
62
class TESS_API ImageData {
63
public:
64
  ImageData();
65
  // Takes ownership of the pix.
66
  ImageData(bool vertical, Image pix);
67
  ~ImageData();
68
69
  // Builds and returns an ImageData from the basic data. Note that imagedata,
70
  // truth_text, and box_text are all the actual file data, NOT filenames.
  // NOTE(review): the result is a raw ImageData*; presumably the caller owns
  // it - confirm against the definition.
71
  static ImageData *Build(const char *name, int page_number, const char *lang,
72
                          const char *imagedata, int imagedatasize, const char *truth_text,
73
                          const char *box_text);
74
75
  // Writes to the given file. Returns false in case of error.
76
  bool Serialize(TFile *fp) const;
77
  // Reads from the given file. Returns false in case of error.
78
  bool DeSerialize(TFile *fp);
79
  // As DeSerialize, but only seeks past the data - hence a static method.
80
  static bool SkipDeSerialize(TFile *fp);
81
82
  // Other accessors.
  // The getters below return a const reference to (or, for page_number, a
  // copy of) the corresponding data member; the setters copy their argument
  // into it. None of them performs any validation.
83
0
  const std::string &imagefilename() const {
84
0
    return imagefilename_;
85
0
  }
86
0
  void set_imagefilename(const std::string &name) {
87
0
    imagefilename_ = name;
88
0
  }
89
0
  int page_number() const {
90
0
    return page_number_;
91
0
  }
92
0
  void set_page_number(int num) {
93
0
    page_number_ = num;
94
0
  }
95
0
  const std::vector<char> &image_data() const {
96
0
    return image_data_;
97
0
  }
98
0
  const std::string &language() const {
99
0
    return language_;
100
0
  }
101
0
  void set_language(const std::string &lang) {
102
0
    language_ = lang;
103
0
  }
104
0
  const std::string &transcription() const {
105
0
    return transcription_;
106
0
  }
107
0
  const std::vector<TBOX> &boxes() const {
108
0
    return boxes_;
109
0
  }
110
0
  const std::vector<std::string> &box_texts() const {
111
0
    return box_texts_;
112
0
  }
  // Returns the text of the box at position index. The index is passed
  // straight to std::vector::operator[], which does no bounds checking, so
  // the caller must ensure 0 <= index < box_texts().size().
113
0
  const std::string &box_text(int index) const {
114
0
    return box_texts_[index];
115
0
  }
116
  // Saves the given Pix as a PNG-encoded string and destroys it.
117
  // In case of missing PNG support in Leptonica use PNM format,
118
  // which requires more memory.
119
  void SetPix(Image pix);
120
  // Returns the Pix image for *this. Must be pixDestroyed after use.
121
  Image GetPix() const;
122
  // Gets anything and everything with a non-nullptr pointer, prescaled to a
123
  // given target_height (if 0, then the original image height), and aligned.
124
  // Also returns (if not nullptr) the width and height of the scaled image.
125
  // The return value is the scaled Pix, which must be pixDestroyed after use,
126
  // and scale_factor (if not nullptr) is set to the scale factor that was
127
  // applied to the image to achieve the target_height.
  // NOTE(review): max_height is not described above and its effect cannot be
  // determined from this header - see the definition.
128
  Image PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width,
129
                int *scaled_height, std::vector<TBOX> *boxes) const;
130
131
  // NOTE(review): undocumented in the original; presumably an estimate of the
  // memory held by this object - confirm units and scope in the definition.
  int MemoryUsed() const;
132
133
  // Draws the data in a new window.
134
  void Display() const;
135
136
  // Adds the supplied boxes and transcriptions that correspond to the correct
137
  // page number.
138
  void AddBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
139
                const std::vector<int> &box_pages);
140
141
// Private helpers (the data members follow in a second private section).
private:
142
  // Saves the given Pix as a PNG-encoded string and destroys it.
143
  // In case of missing PNG support in Leptonica use PNM format,
144
  // which requires more memory.
145
  static void SetPixInternal(Image pix, std::vector<char> *image_data);
146
  // Returns the Pix image for the image_data. Must be pixDestroyed after use.
147
  static Image GetPixInternal(const std::vector<char> &image_data);
148
  // Parses the text string as a box file and adds any discovered boxes that
149
  // match the page number. Returns false on error.
  // Overload of the public AddBoxes above, taking raw box-file text instead
  // of already-parsed vectors.
150
  bool AddBoxes(const char *box_text);
151
152
// Data members. page_number_ and vertical_text_ have no in-class initializers,
// so their initial values come from the out-of-line constructors.
private:
153
  std::string imagefilename_; // File to read image from.
154
  // Stored as int32_t, but exposed as plain int by page_number() and
  // set_page_number().
  int32_t page_number_;  // Page number if multi-page tif or -1.
155
  // see https://github.com/tesseract-ocr/tesseract/pull/2965
156
  // EP: reconsider for tess6.0/opencv
  // internal_pix_ only exists when TESSERACT_IMAGEDATA_AS_PIX is defined, so
  // the object layout depends on that macro.
157
#ifdef TESSERACT_IMAGEDATA_AS_PIX
158
  Image internal_pix_;
159
#endif
160
  std::vector<char> image_data_;  // PNG/PNM file data.
161
  std::string language_;          // Language code for image.
162
  std::string transcription_;     // UTF-8 ground truth of image.
163
  std::vector<TBOX> boxes_;       // If non-empty boxes of the image.
164
  std::vector<std::string> box_texts_; // String for text in each box.
165
  bool vertical_text_;            // Image has been rotated from vertical.
166
};
167
168
// A collection of ImageData that knows roughly how much memory it is using.
// Two mutexes are used: pages_mutex_ guards pages_ and pages_offset_ (and is
// held for the duration of a load), while general_mutex_ guards the cheap
// bookkeeping fields so they can be read without waiting for a load.
// Only individual members are marked TESS_API, not the class as a whole.
169
class DocumentData {
170
public:
171
  TESS_API
172
  explicit DocumentData(const std::string &name);
173
  TESS_API
174
  ~DocumentData();
175
176
  // Reads all the pages in the given lstmf filename to the cache. The reader
177
  // is used to read the file.
178
  TESS_API
179
  bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader);
180
  // Sets up the document, without actually loading it.
181
  void SetDocument(const char *filename, int64_t max_memory, FileReader reader);
182
  // Writes all the pages to the given filename. Returns false on error.
183
  TESS_API
184
  bool SaveDocument(const char *filename, FileWriter writer);
185
186
  // Adds the given page data to this document, counting up memory.
187
  TESS_API
188
  void AddPageToDocument(ImageData *page);
189
  // Returns a reference to the document name.
  // NOTE(review): general_mutex_ is held only until this function returns,
  // so the caller reads through the returned reference after the lock has
  // been released.
190
0
  const std::string &document_name() const {
191
0
    std::lock_guard<std::mutex> lock(general_mutex_);
192
0
    return document_name_;
193
0
  }
  // Returns a copy of total_pages_, read under general_mutex_. A negative
  // value means the document is not cached (see IsCached below).
194
0
  int NumPages() const {
195
0
    std::lock_guard<std::mutex> lock(general_mutex_);
196
0
    return total_pages_;
197
0
  }
  // Returns the number of entries currently held in pages_.
  // NOTE(review): this reads pages_ without taking pages_mutex_, although the
  // member comments say that mutex protects pages_ - confirm callers only use
  // this when no load is in progress.
198
0
  size_t PagesSize() const {
199
0
    return pages_.size();
200
0
  }
  // Returns a copy of memory_used_, read under general_mutex_.
201
0
  int64_t memory_used() const {
202
0
    std::lock_guard<std::mutex> lock(general_mutex_);
203
0
    return memory_used_;
204
0
  }
205
  // If the given index is not currently loaded, loads it using a separate
206
  // thread. Note: there are 4 cases:
207
  // 1. Document uncached: IsCached() returns false, total_pages_ < 0.
208
  // 2. Required page is available: IsPageAvailable returns true. In this case,
209
  //    total_pages_ > 0 and
210
  //    pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size()
211
  // 3. Pages are loaded, but the required one is not.
212
  // 4. The requested page is being loaded by LoadPageInBackground. In this case,
213
  //    index == pages_offset_. Once the loading starts, the pages lock is held
214
  //    until it completes, at which point IsPageAvailable will unblock and return
215
  //    true.
  // NOTE(review): the upper bound in case 2 is presumably exclusive
  // (index%total_pages_ < pages_offset_+pages_.size()) - confirm against the
  // definition of IsPageAvailable.
216
  void LoadPageInBackground(int index);
217
  // Returns a pointer to the page with the given index, modulo the total
218
  // number of pages. Blocks until the background load is completed.
219
  TESS_API
220
  const ImageData *GetPage(int index);
221
  // Returns true if the requested page is available, and provides a pointer,
222
  // which may be nullptr if the document is empty. May block, even though it
223
  // doesn't guarantee to return true.
224
  bool IsPageAvailable(int index, ImageData **page);
225
  // Takes ownership of the given page index. The page is made nullptr in *this.
  // Runs under pages_mutex_. The index is used directly as a position in
  // pages_: no modulo, no pages_offset_ adjustment and no bounds check are
  // applied here, so the caller must pass a valid position. The slot itself
  // stays in pages_ (holding nullptr); the vector is not shrunk.
226
0
  ImageData *TakePage(int index) {
227
0
    std::lock_guard<std::mutex> lock(pages_mutex_);
228
0
    ImageData *page = pages_[index];
229
0
    pages_[index] = nullptr;
230
0
    return page;
231
0
  }
232
  // Returns true if the document is currently loaded or in the process of
233
  // loading.
  // Implemented as NumPages() >= 0, so it takes general_mutex_ briefly.
234
0
  bool IsCached() const {
235
0
    return NumPages() >= 0;
236
0
  }
237
  // Removes all pages from memory and frees the memory, but does not forget
238
  // the document metadata. Returns the memory saved.
239
  int64_t UnCache();
240
  // Shuffles all the pages in the document.
241
  void Shuffle();
242
243
private:
244
  // Sets the value of total_pages_ behind a mutex.
245
0
  void set_total_pages(int total) {
246
0
    std::lock_guard<std::mutex> lock(general_mutex_);
247
0
    total_pages_ = total;
248
0
  }
  // Sets the value of memory_used_ behind general_mutex_.
249
0
  void set_memory_used(int64_t memory_used) {
250
0
    std::lock_guard<std::mutex> lock(general_mutex_);
251
0
    memory_used_ = memory_used;
252
0
  }
253
  // Locks the pages_mutex_ and loads as many pages as will fit into max_memory_
254
  // starting at index pages_offset_.
255
  bool ReCachePages();
256
257
// Data members. None has an in-class initializer, so initial values come from
// the out-of-line constructor.
private:
258
  // A name for this document.
259
  std::string document_name_;
260
  // A group of pages that corresponds in some loose way to a document.
  // Entries are raw pointers and may be nullptr after TakePage().
261
  std::vector<ImageData *> pages_;
262
  // Page number of the first index in pages_.
263
  int pages_offset_;
264
  // Total number of pages in document (may exceed size of pages_.)
  // Negative while the document is not cached (see IsCached()).
265
  int total_pages_;
266
  // Total of all pix sizes in the document.
267
  int64_t memory_used_;
268
  // Max memory to use at any time.
269
  int64_t max_memory_;
270
  // Saved reader from LoadDocument to allow re-caching.
271
  FileReader reader_;
272
  // Mutex that protects pages_ and pages_offset_ against multiple parallel
273
  // loads, and provides a wait for page.
  // Not declared mutable, so const member functions cannot lock it.
274
  std::mutex pages_mutex_;
275
  // Mutex that protects other data members that callers want to access without
276
  // waiting for a load operation.
  // Declared mutable so the const accessors above can lock it.
277
  mutable std::mutex general_mutex_;
278
279
  // Thread which loads document.
  // NOTE(review): unlike the other data members this one has no trailing
  // underscore, so inside this class the name `thread` refers to the member.
280
  std::thread thread;
281
};
282
283
// A collection of DocumentData that knows roughly how much memory it is using.
284
// Note that while it supports background read-ahead, it assumes that a single
285
// thread is accessing documents, ie it is not safe for multiple threads to
286
// access different documents in parallel, as one may de-cache the other's
287
// content.
// The cache holds its documents as raw pointers and deletes them in Clear().
// Only individual members are marked TESS_API, not the class as a whole.
288
class DocumentCache {
289
public:
290
  TESS_API
291
  explicit DocumentCache(int64_t max_memory);
292
  TESS_API
293
  ~DocumentCache();
294
295
  // Deletes all existing documents from the cache.
  // Every pointer in documents_ is deleted, the vector is emptied and
  // num_pages_per_doc_ is reset to 0. cache_strategy_ and max_memory_ are
  // left unchanged.
296
0
  void Clear() {
297
0
    for (auto *document : documents_) {
298
0
      delete document;
299
0
    }
300
0
    documents_.clear();
301
0
    num_pages_per_doc_ = 0;
302
0
  }
303
  // Adds all the documents in the list of filenames, counting memory.
304
  // The reader is used to read the files.
305
  TESS_API
306
  bool LoadDocuments(const std::vector<std::string> &filenames, CachingStrategy cache_strategy,
307
                     FileReader reader);
308
309
  // Adds document to the cache.
  // NOTE(review): since Clear() deletes everything in documents_, the cache
  // presumably takes ownership of data on success - confirm in the definition
  // what happens to data when this returns false.
310
  bool AddToCache(DocumentData *data);
311
312
  // Finds and returns a document by name.
313
  DocumentData *FindDocument(const std::string &document_name) const;
314
315
  // Returns a page by serial number using the current cache_strategy_ to
316
  // determine the mapping from serial number to page.
  // CS_SEQUENTIAL dispatches to GetPageSequential; any other strategy value
  // dispatches to GetPageRoundRobin.
317
0
  const ImageData *GetPageBySerial(int serial) {
318
0
    if (cache_strategy_ == CS_SEQUENTIAL) {
319
0
      return GetPageSequential(serial);
320
0
    } else {
321
0
      return GetPageRoundRobin(serial);
322
0
    }
323
0
  }
324
  // Read-only access to the list of documents held by the cache. The returned
  // pointers remain owned by the cache.
325
0
  const std::vector<DocumentData *> &documents() const {
326
0
    return documents_;
327
0
  }
328
  // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
329
  // strategy, could take a long time.
330
  TESS_API
331
  int TotalPages();
332
333
private:
334
  // Returns a page by serial number, selecting them in a round-robin fashion
335
  // from all the documents. Highly disk-intensive, but doesn't need samples
336
  // to be shuffled between files to begin with.
  // NOTE(review): private yet TESS_API - presumably exported because the
  // inline GetPageBySerial above calls it from code compiled outside the
  // library.
337
  TESS_API
338
  const ImageData *GetPageRoundRobin(int serial);
339
  // Returns a page by serial number, selecting them in sequence from each file.
340
  // Requires the samples to be shuffled between the files to give a random or
341
  // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
342
  TESS_API
343
  const ImageData *GetPageSequential(int serial);
344
345
  // Helper counts the number of adjacent cached neighbour documents_ of index
346
  // looking in direction dir, ie index+dir, index+2*dir etc.
347
  int CountNeighbourDocs(int index, int dir);
348
349
  // The documents held by this cache. The pointers are owned by the cache:
  // Clear() deletes each one.
350
  std::vector<DocumentData *> documents_;
351
  // Strategy to use for caching and serializing data samples.
352
  CachingStrategy cache_strategy_ = CS_SEQUENTIAL;
353
  // Number of pages in the first document, used as a divisor in
354
  // GetPageSequential to determine the document index.
  // Reset to 0 by Clear().
355
  int num_pages_per_doc_ = 0;
356
  // Max memory allowed in this cache.
357
  int64_t max_memory_ = 0;
358
};
359
360
} // namespace tesseract
361
362
#endif // TESSERACT_IMAGE_IMAGEDATA_H_