/src/tesseract/src/ccstruct/imagedata.h

Source (jump to first uncovered line)
///////////////////////////////////////////////////////////////////////
// File:        imagedata.h
// Description: Class to hold information about a single image and its
//              corresponding boxes or text file.
// Author:      Ray Smith
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_IMAGE_IMAGEDATA_H_
#define TESSERACT_IMAGE_IMAGEDATA_H_

#include "image.h"
#include "points.h" // for FCOORD

#include <mutex>  // for std::mutex
#include <thread> // for std::thread

struct Pix;

namespace tesseract {

class TFile;
class ScrollView;
class TBOX;

// Padding, measured in output pixels, that is applied in feature mode.
constexpr int kFeaturePadding = 2;
// Padding, measured in pixels, that is placed around text boxes.
constexpr int kImagePadding = 4;

// Selects how samples are cached and in which order they are served.
enum CachingStrategy {
  // Consumes one whole file before advancing to the next, so the samples must
  // already be shuffled across files. The sample count of the first file is
  // assumed for every file, which gives high-speed random access. The price:
  // entries of smaller subsequent files are used more than once, and some
  // entries of larger subsequent files are never used.
  // Best suited to larger data sets that do not fit in memory.
  CS_SEQUENTIAL = 0,
  // Takes one sample from each file in turn. Samples need not be shuffled,
  // but this is extremely disk-intensive, and samples in smaller files are
  // used more often than samples in larger files.
  // Best suited to smaller data sets that mostly fit in memory.
  CS_ROUND_ROBIN = 1,
};

// Holds everything known about a single image: its filename, the encoded
// image data, optional character boxes and the text transcription.
// The transcription is the ground-truth UTF-8 text of the image. Character
// boxes, when present, describe the desired segmentation of the text into
// recognition units.
class TESS_API ImageData {
public:
  ImageData();
  // Assumes ownership of pix.
  ImageData(bool vertical, Image pix);
  ~ImageData();

  // Factory assembling an ImageData from raw inputs. imagedata, truth_text
  // and box_text carry the actual file contents, NOT filenames.
  static ImageData *Build(const char *name, int page_number, const char *lang,
                          const char *imagedata, int imagedatasize, const char *truth_text,
                          const char *box_text);

  // Serializes *this to fp. A false return signals an error.
  bool Serialize(TFile *fp) const;
  // Populates *this from fp. A false return signals an error.
  bool DeSerialize(TFile *fp);
  // Static counterpart of DeSerialize that only seeks past the data.
  static bool SkipDeSerialize(TFile *fp);

  // Trivial getters and setters.
  const std::string &imagefilename() const { return imagefilename_; }
  void set_imagefilename(const std::string &name) { imagefilename_ = name; }
  int page_number() const { return page_number_; }
  void set_page_number(int num) { page_number_ = num; }
  const std::vector<char> &image_data() const { return image_data_; }
  const std::string &language() const { return language_; }
  void set_language(const std::string &lang) { language_ = lang; }
  const std::string &transcription() const { return transcription_; }
  const std::vector<TBOX> &boxes() const { return boxes_; }
  const std::vector<std::string> &box_texts() const { return box_texts_; }
  // Text of the box at position index. The index is used unchecked.
  const std::string &box_text(int index) const { return box_texts_[index]; }

  // Stores pix as a PNG-encoded string and destroys it.
  // When Leptonica lacks PNG support the PNM format is used instead,
  // which needs more memory.
  void SetPix(Image pix);
  // Yields the Pix image of *this. The caller must pixDestroy it after use.
  Image GetPix() const;
  // Fetches anything and everything that has a non-nullptr pointer, prescaled
  // to target_height (0 means the original image height), and aligned.
  // The width and height of the scaled image are also written to their
  // outputs when those are not nullptr.
  // Returns the scaled Pix, which the caller must pixDestroy after use;
  // scale_factor (if not nullptr) receives the factor that was applied to the
  // image to reach target_height.
  Image PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width,
                 int *scaled_height, std::vector<TBOX> *boxes) const;

  int MemoryUsed() const;

  // Shows the data in a new window.
  void Display() const;

  // Appends those of the supplied boxes and transcriptions that belong to the
  // correct page number.
  void AddBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
                const std::vector<int> &box_pages);

private:
  // Stores pix as a PNG-encoded string in *image_data and destroys it.
  // When Leptonica lacks PNG support the PNM format is used instead,
  // which needs more memory.
  static void SetPixInternal(Image pix, std::vector<char> *image_data);
  // Yields the Pix image decoded from image_data. The caller must pixDestroy
  // it after use.
  static Image GetPixInternal(const std::vector<char> &image_data);
  // Treats box_text as the contents of a box file and adds every box found
  // that matches the page number. A false return signals an error.
  bool AddBoxes(const char *box_text);

  // ---- Data members ----
  std::string imagefilename_; // File the image is read from.
  int32_t page_number_;       // Page number for a multi-page tif, else -1.
  // see https://github.com/tesseract-ocr/tesseract/pull/2965
  // EP: reconsider for tess6.0/opencv
#ifdef TESSERACT_IMAGEDATA_AS_PIX
  Image internal_pix_;
#endif
  std::vector<char> image_data_;       // PNG/PNM file data.
  std::string language_;               // Language code of the image.
  std::string transcription_;          // UTF-8 ground truth of the image.
  std::vector<TBOX> boxes_;            // Boxes of the image, if non-empty.
  std::vector<std::string> box_texts_; // Text string for each box.
  bool vertical_text_;                 // Image has been rotated from vertical.
};

// A collection of ImageData that knows roughly how much memory it is using.
// Two mutexes are used: pages_mutex_ guards pages_ and pages_offset_, while
// general_mutex_ guards the members that callers want to read without
// waiting for a load operation.
class DocumentData {
public:
  TESS_API
  explicit DocumentData(const std::string &name);
  TESS_API
  ~DocumentData();

  // Reads all the pages in the given lstmf filename to the cache. The reader
  // is used to read the file.
  TESS_API
  bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader);
  // Sets up the document, without actually loading it.
  void SetDocument(const char *filename, int64_t max_memory, FileReader reader);
  // Writes all the pages to the given filename. Returns false on error.
  TESS_API
  bool SaveDocument(const char *filename, FileWriter writer);

  // Adds the given page data to this document, counting up memory.
  TESS_API
  void AddPageToDocument(ImageData *page);

  // Returns the document name. NOTE: general_mutex_ is held only while the
  // reference is formed; the caller then reads the string through the
  // returned reference without any lock held.
  const std::string &document_name() const {
    std::lock_guard<std::mutex> lock(general_mutex_);
    return document_name_;
  }
  // Returns total_pages_, read under general_mutex_.
  int NumPages() const {
    std::lock_guard<std::mutex> lock(general_mutex_);
    return total_pages_;
  }
  // Returns the number of entries currently held in pages_.
  // NOTE: pages_ is read here without taking pages_mutex_.
  size_t PagesSize() const {
    return pages_.size();
  }
  // Returns memory_used_, read under general_mutex_.
  int64_t memory_used() const {
    std::lock_guard<std::mutex> lock(general_mutex_);
    return memory_used_;
  }
  // If the given index is not currently loaded, loads it using a separate
  // thread. Note: there are 4 cases:
  // 1. Document uncached: IsCached() returns false, total_pages_ < 0.
  // 2. Required page is available: IsPageAvailable returns true. In this
  //    case, total_pages_ > 0 and
  //    pages_offset_ <= index%total_pages_ < pages_offset_+pages_.size()
  // 3. Pages are loaded, but the required one is not.
  // 4. The requested page is being loaded by LoadPageInBackground. In this
  //    case, index == pages_offset_. Once the loading starts, the pages lock
  //    is held until it completes, at which point IsPageAvailable will
  //    unblock and return true.
  void LoadPageInBackground(int index);
  // Returns a pointer to the page with the given index, modulo the total
  // number of pages. Blocks until the background load is completed.
  TESS_API
  const ImageData *GetPage(int index);
  // Returns true if the requested page is available, and provides a pointer,
  // which may be nullptr if the document is empty. May block, even though it
  // doesn't guarantee to return true.
  bool IsPageAvailable(int index, ImageData **page);
  // Takes ownership of the page stored at the given position of pages_ and
  // sets that slot to nullptr in *this. The index addresses pages_ directly
  // (no modulo is applied). Returns nullptr when the index is negative or not
  // less than pages_.size(), rather than indexing out of bounds; nullptr is
  // also returned when the slot has already been emptied.
  ImageData *TakePage(int index) {
    std::lock_guard<std::mutex> lock(pages_mutex_);
    // Guard the unchecked vector access below against invalid indices.
    if (index < 0 || static_cast<size_t>(index) >= pages_.size()) {
      return nullptr;
    }
    ImageData *page = pages_[index];
    pages_[index] = nullptr;
    return page;
  }
  // Returns true if the document is currently loaded or in the process of
  // loading, ie NumPages() is non-negative.
  bool IsCached() const {
    return NumPages() >= 0;
  }
  // Removes all pages from memory and frees the memory, but does not forget
  // the document metadata. Returns the memory saved.
  int64_t UnCache();
  // Shuffles all the pages in the document.
  void Shuffle();

private:
  // Sets the value of total_pages_ behind a mutex.
  void set_total_pages(int total) {
    std::lock_guard<std::mutex> lock(general_mutex_);
    total_pages_ = total;
  }
  // Sets the value of memory_used_ behind a mutex.
  void set_memory_used(int64_t memory_used) {
    std::lock_guard<std::mutex> lock(general_mutex_);
    memory_used_ = memory_used;
  }
  // Locks the pages_mutex_ and loads as many pages as will fit into max_memory_
  // starting at index pages_offset_.
  bool ReCachePages();

private:
  // A name for this document.
  std::string document_name_;
  // A group of pages that corresponds in some loose way to a document.
  std::vector<ImageData *> pages_;
  // Page number of the first index in pages_.
  int pages_offset_;
  // Total number of pages in document (may exceed size of pages_.)
  int total_pages_;
  // Total of all pix sizes in the document.
  int64_t memory_used_;
  // Max memory to use at any time.
  int64_t max_memory_;
  // Saved reader from LoadDocument to allow re-caching.
  FileReader reader_;
  // Mutex that protects pages_ and pages_offset_ against multiple parallel
  // loads, and provides a wait for page.
  std::mutex pages_mutex_;
  // Mutex that protects other data members that callers want to access without
  // waiting for a load operation. Mutable so that const accessors can lock it.
  mutable std::mutex general_mutex_;

  // Thread which loads document.
  std::thread thread;
};

// A collection of DocumentData that knows roughly how much memory it is using.
// Note that while it supports background read-ahead, it assumes that a single
// thread is accessing documents, ie it is not safe for multiple threads to
// access different documents in parallel, as one may de-cache the other's
// content.
class DocumentCache {
public:
  TESS_API
  explicit DocumentCache(int64_t max_memory);
  TESS_API
  ~DocumentCache();

  // Deletes all existing documents from the cache.
  void Clear() {
    for (auto *document : documents_) {
      delete document;
    }
    documents_.clear();
    num_pages_per_doc_ = 0;
  }
  // Adds all the documents in the list of filenames, counting memory.
  // The reader is used to read the files.
  TESS_API
  bool LoadDocuments(const std::vector<std::string> &filenames, CachingStrategy cache_strategy,
                     FileReader reader);

  // Adds document to the cache.
  bool AddToCache(DocumentData *data);

  // Finds and returns a document by name.
  DocumentData *FindDocument(const std::string &document_name) const;

  // Returns a page by serial number using the current cache_strategy_ to
  // determine the mapping from serial number to page.
  const ImageData *GetPageBySerial(int serial) {
    if (cache_strategy_ == CS_SEQUENTIAL) {
      return GetPageSequential(serial);
    } else {
      return GetPageRoundRobin(serial);
    }
  }

  const std::vector<DocumentData *> &documents() const {
    return documents_;
  }
  // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
  // strategy, could take a long time.
  TESS_API
  int TotalPages();

private:
  // Returns a page by serial number, selecting them in a round-robin fashion
  // from all the documents. Highly disk-intensive, but doesn't need samples
  // to be shuffled between files to begin with.
  TESS_API
  const ImageData *GetPageRoundRobin(int serial);
  // Returns a page by serial number, selecting them in sequence from each file.
  // Requires the samples to be shuffled between the files to give a random or
  // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
  TESS_API
  const ImageData *GetPageSequential(int serial);

  // Helper counts the number of adjacent cached neighbour documents_ of index
  // looking in direction dir, ie index+dir, index+2*dir etc.
  int CountNeighbourDocs(int index, int dir);

  // A group of pages that corresponds in some loose way to a document.
  std::vector<DocumentData *> documents_;
  // Strategy to use for caching and serializing data samples.
  CachingStrategy cache_strategy_ = CS_SEQUENTIAL;
  // Number of pages in the first document, used as a divisor in
  // GetPageSequential to determine the document index.
  int num_pages_per_doc_ = 0;
  // Max memory allowed in this cache.
  int64_t max_memory_ = 0;
};

} // namespace tesseract

#endif // TESSERACT_IMAGE_IMAGEDATA_H_

Coverage Report

Created: 2025-06-13 07:15

Line	Count	Source (jump to first uncovered line)
1		///////////////////////////////////////////////////////////////////////
2		// File: imagedata.h
3		// Description: Class to hold information about a single image and its
4		// corresponding boxes or text file.
5		// Author: Ray Smith
6		//
7		// (C) Copyright 2013, Google Inc.
8		// Licensed under the Apache License, Version 2.0 (the "License");
9		// you may not use this file except in compliance with the License.
10		// You may obtain a copy of the License at
11		// http://www.apache.org/licenses/LICENSE-2.0
12		// Unless required by applicable law or agreed to in writing, software
13		// distributed under the License is distributed on an "AS IS" BASIS,
14		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15		// See the License for the specific language governing permissions and
16		// limitations under the License.
17		///////////////////////////////////////////////////////////////////////
18
19		#ifndef TESSERACT_IMAGE_IMAGEDATA_H_
20		#define TESSERACT_IMAGE_IMAGEDATA_H_
21
22		#include "image.h"
23		#include "points.h" // for FCOORD
24
25		#include <mutex> // for std::mutex
26		#include <thread> // for std::thread
27
28		struct Pix;
29
30		namespace tesseract {
31
32		class TFile;
33		class ScrollView;
34		class TBOX;
35
36		// Amount of padding to apply in output pixels in feature mode.
37		const int kFeaturePadding = 2;
38		// Number of pixels to pad around text boxes.
39		const int kImagePadding = 4;
40
41		// Enum to determine the caching and data sequencing strategy.
42		enum CachingStrategy {
43		// Reads all of one file before moving on to the next. Requires samples to be
44		// shuffled across files. Uses the count of samples in the first file as
45		// the count in all the files to achieve high-speed random access. As a
46		// consequence, if subsequent files are smaller, they get entries used more
47		// than once, and if subsequent files are larger, some entries are not used.
48		// Best for larger data sets that don't fit in memory.
49		CS_SEQUENTIAL,
50		// Reads one sample from each file in rotation. Does not require shuffled
51		// samples, but is extremely disk-intensive. Samples in smaller files also
52		// get used more often than samples in larger files.
53		// Best for smaller data sets that mostly fit in memory.
54		CS_ROUND_ROBIN,
55		};
56
57		// Class to hold information on a single image:
58		// Filename, cached image as a Pix*, character boxes, text transcription.
59		// The text transcription is the ground truth UTF-8 text for the image.
60		// Character boxes are optional and indicate the desired segmentation of
61		// the text into recognition units.
62		class TESS_API ImageData {
63		public:
64		ImageData();
65		// Takes ownership of the pix.
66		ImageData(bool vertical, Image pix);
67		~ImageData();
68
69		// Builds and returns an ImageData from the basic data. Note that imagedata,
70		// truth_text, and box_text are all the actual file data, NOT filenames.
71		static ImageData *Build(const char *name, int page_number, const char *lang,
72		const char *imagedata, int imagedatasize, const char *truth_text,
73		const char *box_text);
74
75		// Writes to the given file. Returns false in case of error.
76		bool Serialize(TFile *fp) const;
77		// Reads from the given file. Returns false in case of error.
78		bool DeSerialize(TFile *fp);
79		// As DeSerialize, but only seeks past the data - hence a static method.
80		static bool SkipDeSerialize(TFile *fp);
81
82		// Other accessors.
83	0	const std::string &imagefilename() const {
84	0	return imagefilename_;
85	0	}
86	0	void set_imagefilename(const std::string &name) {
87	0	imagefilename_ = name;
88	0	}
89	0	int page_number() const {
90	0	return page_number_;
91	0	}
92	0	void set_page_number(int num) {
93	0	page_number_ = num;
94	0	}
95	0	const std::vector<char> &image_data() const {
96	0	return image_data_;
97	0	}
98	0	const std::string &language() const {
99	0	return language_;
100	0	}
101	0	void set_language(const std::string &lang) {
102	0	language_ = lang;
103	0	}
104	0	const std::string &transcription() const {
105	0	return transcription_;
106	0	}
107	0	const std::vector<TBOX> &boxes() const {
108	0	return boxes_;
109	0	}
110	0	const std::vector<std::string> &box_texts() const {
111	0	return box_texts_;
112	0	}
113	0	const std::string &box_text(int index) const {
114	0	return box_texts_[index];
115	0	}
116		// Saves the given Pix as a PNG-encoded string and destroys it.
117		// In case of missing PNG support in Leptonica use PNM format,
118		// which requires more memory.
119		void SetPix(Image pix);
120		// Returns the Pix image for *this. Must be pixDestroyed after use.
121		Image GetPix() const;
122		// Gets anything and everything with a non-nullptr pointer, prescaled to a
123		// given target_height (if 0, then the original image height), and aligned.
124		// Also returns (if not nullptr) the width and height of the scaled image.
125		// The return value is the scaled Pix, which must be pixDestroyed after use,
126		// and scale_factor (if not nullptr) is set to the scale factor that was
127		// applied to the image to achieve the target_height.
128		Image PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width,
129		int *scaled_height, std::vector<TBOX> *boxes) const;
130
131		int MemoryUsed() const;
132
133		// Draws the data in a new window.
134		void Display() const;
135
136		// Adds the supplied boxes and transcriptions that correspond to the correct
137		// page number.
138		void AddBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
139		const std::vector<int> &box_pages);
140
141		private:
142		// Saves the given Pix as a PNG-encoded string and destroys it.
143		// In case of missing PNG support in Leptonica use PNM format,
144		// which requires more memory.
145		static void SetPixInternal(Image pix, std::vector<char> *image_data);
146		// Returns the Pix image for the image_data. Must be pixDestroyed after use.
147		static Image GetPixInternal(const std::vector<char> &image_data);
148		// Parses the text string as a box file and adds any discovered boxes that
149		// match the page number. Returns false on error.
150		bool AddBoxes(const char *box_text);
151
152		private:
153		std::string imagefilename_; // File to read image from.
154		int32_t page_number_; // Page number if multi-page tif or -1.
155		// see https://github.com/tesseract-ocr/tesseract/pull/2965
156		// EP: reconsider for tess6.0/opencv
157		#ifdef TESSERACT_IMAGEDATA_AS_PIX
158		Image internal_pix_;
159		#endif
160		std::vector<char> image_data_; // PNG/PNM file data.
161		std::string language_; // Language code for image.
162		std::string transcription_; // UTF-8 ground truth of image.
163		std::vector<TBOX> boxes_; // If non-empty boxes of the image.
164		std::vector<std::string> box_texts_; // String for text in each box.
165		bool vertical_text_; // Image has been rotated from vertical.
166		};
167
168		// A collection of ImageData that knows roughly how much memory it is using.
169		class DocumentData {
170		public:
171		TESS_API
172		explicit DocumentData(const std::string &name);
173		TESS_API
174		~DocumentData();
175
176		// Reads all the pages in the given lstmf filename to the cache. The reader
177		// is used to read the file.
178		TESS_API
179		bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader);
180		// Sets up the document, without actually loading it.
181		void SetDocument(const char *filename, int64_t max_memory, FileReader reader);
182		// Writes all the pages to the given filename. Returns false on error.
183		TESS_API
184		bool SaveDocument(const char *filename, FileWriter writer);
185
186		// Adds the given page data to this document, counting up memory.
187		TESS_API
188		void AddPageToDocument(ImageData *page);
189
190	0	const std::string &document_name() const {
191	0	std::lock_guard<std::mutex> lock(general_mutex_);
192	0	return document_name_;
193	0	}
194	0	int NumPages() const {
195	0	std::lock_guard<std::mutex> lock(general_mutex_);
196	0	return total_pages_;
197	0	}
198	0	size_t PagesSize() const {
199	0	return pages_.size();
200	0	}
201	0	int64_t memory_used() const {
202	0	std::lock_guard<std::mutex> lock(general_mutex_);
203	0	return memory_used_;
204	0	}
205		// If the given index is not currently loaded, loads it using a separate
206		// thread. Note: there are 4 cases:
207		// Document uncached: IsCached() returns false, total_pages_ < 0.
208		// Required page is available: IsPageAvailable returns true. In this case,
209		// total_pages_ > 0 and
210		// pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size()
211		// Pages are loaded, but the required one is not.
212		// The requested page is being loaded by LoadPageInBackground. In this case,
213		// index == pages_offset_. Once the loading starts, the pages lock is held
214		// until it completes, at which point IsPageAvailable will unblock and return
215		// true.
216		void LoadPageInBackground(int index);
217		// Returns a pointer to the page with the given index, modulo the total
218		// number of pages. Blocks until the background load is completed.
219		TESS_API
220		const ImageData *GetPage(int index);
221		// Returns true if the requested page is available, and provides a pointer,
222		// which may be nullptr if the document is empty. May block, even though it
223		// doesn't guarantee to return true.
224		bool IsPageAvailable(int index, ImageData **page);
225		// Takes ownership of the given page index. The page is made nullptr in *this.
226	0	ImageData *TakePage(int index) {
227	0	std::lock_guard<std::mutex> lock(pages_mutex_);
228	0	ImageData *page = pages_[index];
229	0	pages_[index] = nullptr;
230	0	return page;
231	0	}
232		// Returns true if the document is currently loaded or in the process of
233		// loading.
234	0	bool IsCached() const {
235	0	return NumPages() >= 0;
236	0	}
237		// Removes all pages from memory and frees the memory, but does not forget
238		// the document metadata. Returns the memory saved.
239		int64_t UnCache();
240		// Shuffles all the pages in the document.
241		void Shuffle();
242
243		private:
244		// Sets the value of total_pages_ behind a mutex.
245	0	void set_total_pages(int total) {
246	0	std::lock_guard<std::mutex> lock(general_mutex_);
247	0	total_pages_ = total;
248	0	}
249	0	void set_memory_used(int64_t memory_used) {
250	0	std::lock_guard<std::mutex> lock(general_mutex_);
251	0	memory_used_ = memory_used;
252	0	}
253		// Locks the pages_mutex_ and loads as many pages as will fit into max_memory_
254		// starting at index pages_offset_.
255		bool ReCachePages();
256
257		private:
258		// A name for this document.
259		std::string document_name_;
260		// A group of pages that corresponds in some loose way to a document.
261		std::vector<ImageData *> pages_;
262		// Page number of the first index in pages_.
263		int pages_offset_;
264		// Total number of pages in document (may exceed size of pages_.)
265		int total_pages_;
266		// Total of all pix sizes in the document.
267		int64_t memory_used_;
268		// Max memory to use at any time.
269		int64_t max_memory_;
270		// Saved reader from LoadDocument to allow re-caching.
271		FileReader reader_;
272		// Mutex that protects pages_ and pages_offset_ against multiple parallel
273		// loads, and provides a wait for page.
274		std::mutex pages_mutex_;
275		// Mutex that protects other data members that callers want to access without
276		// waiting for a load operation.
277		mutable std::mutex general_mutex_;
278
279		// Thread which loads document.
280		std::thread thread;
281		};
282
283		// A collection of DocumentData that knows roughly how much memory it is using.
284		// Note that while it supports background read-ahead, it assumes that a single
285		// thread is accessing documents, ie it is not safe for multiple threads to
286		// access different documents in parallel, as one may de-cache the other's
287		// content.
288		class DocumentCache {
289		public:
290		TESS_API
291		explicit DocumentCache(int64_t max_memory);
292		TESS_API
293		~DocumentCache();
294
295		// Deletes all existing documents from the cache.
296	0	void Clear() {
297	0	for (auto *document : documents_) {
298	0	delete document;
299	0	}
300	0	documents_.clear();
301	0	num_pages_per_doc_ = 0;
302	0	}
303		// Adds all the documents in the list of filenames, counting memory.
304		// The reader is used to read the files.
305		TESS_API
306		bool LoadDocuments(const std::vector<std::string> &filenames, CachingStrategy cache_strategy,
307		FileReader reader);
308
309		// Adds document to the cache.
310		bool AddToCache(DocumentData *data);
311
312		// Finds and returns a document by name.
313		DocumentData *FindDocument(const std::string &document_name) const;
314
315		// Returns a page by serial number using the current cache_strategy_ to
316		// determine the mapping from serial number to page.
317	0	const ImageData *GetPageBySerial(int serial) {
318	0	if (cache_strategy_ == CS_SEQUENTIAL) {
319	0	return GetPageSequential(serial);
320	0	} else {
321	0	return GetPageRoundRobin(serial);
322	0	}
323	0	}
324
325	0	const std::vector<DocumentData *> &documents() const {
326	0	return documents_;
327	0	}
328		// Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
329		// strategy, could take a long time.
330		TESS_API
331		int TotalPages();
332
333		private:
334		// Returns a page by serial number, selecting them in a round-robin fashion
335		// from all the documents. Highly disk-intensive, but doesn't need samples
336		// to be shuffled between files to begin with.
337		TESS_API
338		const ImageData *GetPageRoundRobin(int serial);
339		// Returns a page by serial number, selecting them in sequence from each file.
340		// Requires the samples to be shuffled between the files to give a random or
341		// uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
342		TESS_API
343		const ImageData *GetPageSequential(int serial);
344
345		// Helper counts the number of adjacent cached neighbour documents_ of index
346		// looking in direction dir, ie index+dir, index+2*dir etc.
347		int CountNeighbourDocs(int index, int dir);
348
349		// A group of pages that corresponds in some loose way to a document.
350		std::vector<DocumentData *> documents_;
351		// Strategy to use for caching and serializing data samples.
352		CachingStrategy cache_strategy_ = CS_SEQUENTIAL;
353		// Number of pages in the first document, used as a divisor in
354		// GetPageSequential to determine the document index.
355		int num_pages_per_doc_ = 0;
356		// Max memory allowed in this cache.
357		int64_t max_memory_ = 0;
358		};
359
360		} // namespace tesseract
361
362		#endif // TESSERACT_IMAGE_IMAGEDATA_H_