/src/tesseract/src/ccmain/tesseractclass.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | ///////////////////////////////////////////////////////////////////////  | 
2  |  | // File:        tesseractclass.cpp  | 
3  |  | // Description: The Tesseract class. It holds/owns everything needed  | 
4  |  | //              to run Tesseract on a single language, and also a set of  | 
5  |  | //              sub-Tesseracts to run sub-languages. For thread safety, *every*  | 
6  |  | //              variable that was previously global or static (except for  | 
7  |  | //              constant data, and some visual debugging flags) has been moved  | 
8  |  | //              in here, directly, or indirectly.  | 
9  |  | //              This makes it safe to run multiple Tesseracts in different  | 
10  |  | //              threads in parallel, and keeps the different language  | 
11  |  | //              instances separate.  | 
12  |  | //              Some global functions remain, but they are isolated re-entrant  | 
13  |  | //              functions that operate on their arguments. Functions that work  | 
14  |  | //              on variable data have been moved to an appropriate class based  | 
15  |  | //              mostly on the directory hierarchy. For more information see  | 
16  |  | //              slide 6 of "2ArchitectureAndDataStructures" in  | 
17  |  | // https://drive.google.com/file/d/0B7l10Bj_LprhbUlIUFlCdGtDYkE/edit?usp=sharing  | 
18  |  | //              Some global data and related functions still exist in the  | 
19  |  | //              training-related code, but they don't interfere with normal  | 
20  |  | //              recognition operation.  | 
21  |  | // Author:      Ray Smith  | 
22  |  | //  | 
23  |  | // (C) Copyright 2008, Google Inc.  | 
24  |  | // Licensed under the Apache License, Version 2.0 (the "License");  | 
25  |  | // you may not use this file except in compliance with the License.  | 
26  |  | // You may obtain a copy of the License at  | 
27  |  | // http://www.apache.org/licenses/LICENSE-2.0  | 
28  |  | // Unless required by applicable law or agreed to in writing, software  | 
29  |  | // distributed under the License is distributed on an "AS IS" BASIS,  | 
30  |  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
31  |  | // See the License for the specific language governing permissions and  | 
32  |  | // limitations under the License.  | 
33  |  | //  | 
34  |  | ///////////////////////////////////////////////////////////////////////  | 
35  |  |  | 
36  |  | // Include automatically generated configuration file if running autoconf.  | 
37  |  | #ifdef HAVE_CONFIG_H  | 
38  |  | #  include "config_auto.h"  | 
39  |  | #endif  | 
40  |  |  | 
41  |  | #include "tesseractclass.h"  | 
42  |  |  | 
43  |  | #include <allheaders.h>  | 
44  |  | #include "edgblob.h"  | 
45  |  | #ifndef DISABLED_LEGACY_ENGINE  | 
46  |  | #  include "equationdetect.h"  | 
47  |  | #endif  | 
48  |  | #include "lstmrecognizer.h"  | 
49  |  | #include "thresholder.h" // for ThresholdMethod  | 
50  |  |  | 
51  |  | namespace tesseract { | 
52  |  |  | 
53  |  | Tesseract::Tesseract()  | 
54  | 4  |     : BOOL_MEMBER(tessedit_resegment_from_boxes, false,  | 
55  |  |                   "Take segmentation and labeling from box file", this->params())  | 
56  | 4  |     , BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,  | 
57  |  |                   "Conversion of word/line box file to char box file", this->params())  | 
58  | 4  |     , BOOL_MEMBER(tessedit_train_from_boxes, false, "Generate training data from boxed chars",  | 
59  |  |                   this->params())  | 
60  | 4  |     , BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, "Generate more boxes from boxed chars",  | 
61  |  |                   this->params())  | 
62  | 4  |     , BOOL_MEMBER(tessedit_train_line_recognizer, false,  | 
63  |  |                   "Break input into lines and remap boxes if present", this->params())  | 
64  | 4  |     , BOOL_MEMBER(tessedit_dump_pageseg_images, false,  | 
65  |  |                   "Dump intermediate images made during page segmentation", this->params())  | 
66  |  |     // TODO: remove deprecated tessedit_do_invert in release 6.  | 
67  | 4  |     , BOOL_MEMBER(tessedit_do_invert, true,  | 
68  |  |                   "Try inverted line image if necessary (deprecated, will be "  | 
69  |  |                   "removed in release 6, use the 'invert_threshold' parameter instead)",  | 
70  |  |                   this->params())  | 
71  | 4  |     , double_MEMBER(invert_threshold, 0.7,  | 
72  |  |                     "For lines with a mean confidence below this value, OCR is also tried with an inverted image",  | 
73  |  |                     this->params())  | 
74  |  |     ,  | 
75  |  |     // The default for pageseg_mode is the old behaviour, so as not to  | 
76  |  |     // upset anything that relies on that.  | 
77  | 4  |     INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,  | 
78  |  |                "Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, "  | 
79  |  |                "4=column,"  | 
80  |  |                " 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char,"  | 
81  |  |                "11=sparse_text, 12=sparse_text+osd, 13=raw_line"  | 
82  |  |                " (Values from PageSegMode enum in tesseract/publictypes.h)",  | 
83  |  |                this->params())  | 
84  | 4  |     , INT_MEMBER(thresholding_method,  | 
85  |  |                  static_cast<int>(ThresholdMethod::Otsu),  | 
86  |  |                  "Thresholding method: 0 = Otsu, 1 = LeptonicaOtsu, 2 = "  | 
87  |  |                  "Sauvola",  | 
88  |  |                  this->params())  | 
89  | 4  |     , BOOL_MEMBER(thresholding_debug, false,  | 
90  |  |                   "Debug the thresholding process",  | 
91  |  |                   this->params())  | 
92  | 4  |     , double_MEMBER(thresholding_window_size, 0.33,  | 
93  |  |                     "Window size for measuring local statistics (to be "  | 
94  |  |                     "multiplied by image DPI). "  | 
95  |  |                     "This parameter is used by the Sauvola thresholding method",  | 
96  |  |                     this->params())  | 
97  | 4  |     , double_MEMBER(thresholding_kfactor, 0.34,  | 
98  |  |                     "Factor for reducing threshold due to variance. "  | 
99  |  |                     "This parameter is used by the Sauvola thresholding method."  | 
100  |  |                     " Normal range: 0.2-0.5",  | 
101  |  |                     this->params())  | 
102  | 4  |     , double_MEMBER(thresholding_tile_size, 0.33,  | 
103  |  |                     "Desired tile size (to be multiplied by image DPI). "  | 
104  |  |                     "This parameter is used by the LeptonicaOtsu thresholding "  | 
105  |  |                     "method",  | 
106  |  |                     this->params())  | 
107  | 4  |     , double_MEMBER(thresholding_smooth_kernel_size, 0.0,  | 
108  |  |                     "Size of convolution kernel applied to threshold array "  | 
109  |  |                     "(to be multiplied by image DPI). Use 0 for no smoothing. "  | 
110  |  |                     "This parameter is used by the LeptonicaOtsu thresholding "  | 
111  |  |                     "method",  | 
112  |  |                     this->params())  | 
113  | 4  |     , double_MEMBER(thresholding_score_fraction, 0.1,  | 
114  |  |                     "Fraction of the max Otsu score. "  | 
115  |  |                     "This parameter is used by the LeptonicaOtsu thresholding "  | 
116  |  |                     "method. "  | 
117  |  |                     "For standard Otsu use 0.0, otherwise 0.1 is recommended",  | 
118  |  |                     this->params())  | 
119  | 4  |     , INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,  | 
120  |  |                       "Which OCR engine(s) to run (Tesseract, LSTM, both)."  | 
121  |  |                       " Defaults to loading and running the most accurate"  | 
122  |  |                       " available.",  | 
123  |  |                       this->params())  | 
124  | 4  |     , STRING_MEMBER(tessedit_char_blacklist, "", "Blacklist of chars not to recognize",  | 
125  |  |                     this->params())  | 
126  | 4  |     , STRING_MEMBER(tessedit_char_whitelist, "", "Whitelist of chars to recognize", this->params())  | 
127  | 4  |     , STRING_MEMBER(tessedit_char_unblacklist, "",  | 
128  |  |                     "List of chars to override tessedit_char_blacklist", this->params())  | 
129  | 4  |     , BOOL_MEMBER(tessedit_ambigs_training, false, "Perform training for ambiguities",  | 
130  |  |                   this->params())  | 
131  | 4  |     , INT_MEMBER(pageseg_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT,  | 
132  |  |                  "Whether to use the top-line splitting process for Devanagari "  | 
133  |  |                  "documents while performing page-segmentation.",  | 
134  |  |                  this->params())  | 
135  | 4  |     , INT_MEMBER(ocr_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT,  | 
136  |  |                  "Whether to use the top-line splitting process for Devanagari "  | 
137  |  |                  "documents while performing ocr.",  | 
138  |  |                  this->params())  | 
139  | 4  |     , STRING_MEMBER(tessedit_write_params_to_file, "", "Write all parameters to the given file.",  | 
140  |  |                     this->params())  | 
141  | 4  |     , BOOL_MEMBER(tessedit_adaption_debug, false,  | 
142  |  |                   "Generate and print debug"  | 
143  |  |                   " information for adaption",  | 
144  |  |                   this->params())  | 
145  | 4  |     , INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params())  | 
146  | 4  |     , INT_MEMBER(applybox_debug, 1, "Debug level", this->params())  | 
147  | 4  |     , INT_MEMBER(applybox_page, 0, "Page number to apply boxes from", this->params())  | 
148  | 4  |     , STRING_MEMBER(applybox_exposure_pattern, ".exp",  | 
149  |  |                     "Exposure value follows"  | 
150  |  |                     " this pattern in the image filename. The name of the image"  | 
151  |  |                     " files are expected to be in the form"  | 
152  |  |                     " [lang].[fontname].exp[num].tif",  | 
153  |  |                     this->params())  | 
154  | 4  |     , BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,  | 
155  |  |                   "Learn both character fragments (as is done in the"  | 
156  |  |                   " special low exposure mode) as well as unfragmented"  | 
157  |  |                   " characters.",  | 
158  |  |                   this->params())  | 
159  | 4  |     , BOOL_MEMBER(applybox_learn_ngrams_mode, false,  | 
160  |  |                   "Each bounding box"  | 
161  |  |                   " is assumed to contain ngrams. Only learn the ngrams"  | 
162  |  |                   " whose outlines overlap horizontally.",  | 
163  |  |                   this->params())  | 
164  | 4  |     , BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words", this->params())  | 
165  | 4  |     , BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices", this->params())  | 
166  | 4  |     , BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", this->params())  | 
167  | 4  |     , BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces", this->params())  | 
168  | 4  |     , BOOL_MEMBER(tessedit_unrej_any_wd, false, "Don't bother with word plausibility",  | 
169  |  |                   this->params())  | 
170  | 4  |     , BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?", this->params())  | 
171  | 4  |     , BOOL_MEMBER(tessedit_enable_doc_dict, true, "Add words to the document dictionary",  | 
172  |  |                   this->params())  | 
173  | 4  |     , BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char", this->params())  | 
174  | 4  |     , INT_MEMBER(tessedit_font_id, 0, "Font ID to use or zero", this->params())  | 
175  | 4  |     , BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats", this->params())  | 
176  | 4  |     , BOOL_MEMBER(tessedit_enable_bigram_correction, true,  | 
177  |  |                   "Enable correction based on the word bigram dictionary.", this->params())  | 
178  | 4  |     , BOOL_MEMBER(tessedit_enable_dict_correction, false,  | 
179  |  |                   "Enable single word correction based on the dictionary.", this->params())  | 
180  | 4  |     , INT_MEMBER(tessedit_bigram_debug, 0, "Amount of debug output for bigram correction.",  | 
181  |  |                  this->params())  | 
182  | 4  |     , BOOL_MEMBER(enable_noise_removal, true,  | 
183  |  |                   "Remove and conditionally reassign small outlines when they"  | 
184  |  |                   " confuse layout analysis, determining diacritics vs noise",  | 
185  |  |                   this->params())  | 
186  | 4  |     , INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines", this->params())  | 
187  |  |     ,  | 
188  |  |     // Worst (min) certainty, for which a diacritic  | 
189  |  |     // is allowed to make the base character worse  | 
190  |  |     // and still be included.  | 
191  | 4  |     double_MEMBER(noise_cert_basechar, -8.0, "Hingepoint for base char certainty", this->params())  | 
192  |  |     ,  | 
193  |  |     // Worst (min) certainty, for which a non-overlapping diacritic is allowed  | 
194  |  |     // to make the base character worse and still be included.  | 
195  | 4  |     double_MEMBER(noise_cert_disjoint, -1.0, "Hingepoint for disjoint certainty", this->params())  | 
196  |  |     ,  | 
197  |  |     // Worst (min) certainty, for which a diacritic is allowed to make a new  | 
198  |  |     // stand-alone blob.  | 
199  | 4  |     double_MEMBER(noise_cert_punc, -3.0, "Threshold for new punc char certainty", this->params())  | 
200  |  |     ,  | 
201  |  |     // Factor of certainty margin for adding diacritics to not count as worse.  | 
202  | 4  |     double_MEMBER(noise_cert_factor, 0.375, "Scaling on certainty diff from Hingepoint",  | 
203  |  |                   this->params())  | 
204  | 4  |     , INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob", this->params())  | 
205  | 4  |     , INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word", this->params())  | 
206  | 4  |     , INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params())  | 
207  | 4  |     , STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation", this->params()) | 
208  | 4  |     , STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation", this->params())  | 
209  | 4  |     , STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation", this->params())  | 
210  | 4  |     , double_MEMBER(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit", this->params())  | 
211  | 4  |     , double_MEMBER(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit", this->params())  | 
212  | 4  |     , double_MEMBER(quality_outline_pc, 1.0, "good_quality_doc lte outline error limit",  | 
213  |  |                     this->params())  | 
214  | 4  |     , double_MEMBER(quality_char_pc, 0.95, "good_quality_doc gte good char limit", this->params())  | 
215  | 4  |     , INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word", this->params())  | 
216  | 4  |     , INT_MEMBER(tessedit_tess_adaption_mode, 0x27, "Adaptation decision algorithm for tess",  | 
217  |  |                  this->params())  | 
218  | 4  |     , BOOL_MEMBER(tessedit_minimal_rej_pass1, false, "Do minimal rejection on pass 1 output",  | 
219  |  |                   this->params())  | 
220  | 4  |     , BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria", this->params())  | 
221  | 4  |     , BOOL_MEMBER(test_pt, false, "Test for point", this->params())  | 
222  | 4  |     , double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params())  | 
223  | 4  |     , double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params())  | 
224  | 4  |     , INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.", this->params())  | 
225  | 4  |     , INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", this->params())  | 
226  | 4  |     , BOOL_MEMBER(paragraph_text_based, true,  | 
227  |  |                   "Run paragraph detection on the post-text-recognition "  | 
228  |  |                   "(more accurate)",  | 
229  |  |                   this->params())  | 
230  | 4  |     , BOOL_MEMBER(lstm_use_matrix, 1, "Use ratings matrix/beam search with lstm", this->params())  | 
231  | 4  |     , STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", this->params())  | 
232  | 4  |     , STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines", this->params())  | 
233  | 4  |     , BOOL_MEMBER(tessedit_good_quality_unrej, true, "Reduce rejection on good docs",  | 
234  |  |                   this->params())  | 
235  | 4  |     , BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?", this->params())  | 
236  | 4  |     , double_MEMBER(tessedit_reject_doc_percent, 65.00, "%rej allowed before rej whole doc",  | 
237  |  |                     this->params())  | 
238  | 4  |     , double_MEMBER(tessedit_reject_block_percent, 45.00, "%rej allowed before rej whole block",  | 
239  |  |                     this->params())  | 
240  | 4  |     , double_MEMBER(tessedit_reject_row_percent, 40.00, "%rej allowed before rej whole row",  | 
241  |  |                     this->params())  | 
242  | 4  |     , double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,  | 
243  |  |                     "Number of row rejects in whole word rejects"  | 
244  |  |                     " which prevents whole row rejection",  | 
245  |  |                     this->params())  | 
246  | 4  |     , BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,  | 
247  |  |                   "Only rej partially rejected words in block rejection", this->params())  | 
248  | 4  |     , BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,  | 
249  |  |                   "Only rej partially rejected words in row rejection", this->params())  | 
250  | 4  |     , BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, "Use word segmentation quality metric",  | 
251  |  |                   this->params())  | 
252  | 4  |     , BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, "Use word segmentation quality metric",  | 
253  |  |                   this->params())  | 
254  | 4  |     , INT_MEMBER(tessedit_preserve_min_wd_len, 2, "Only preserve wds longer than this",  | 
255  |  |                  this->params())  | 
256  | 4  |     , BOOL_MEMBER(tessedit_row_rej_good_docs, true, "Apply row rejection to good docs",  | 
257  |  |                   this->params())  | 
258  | 4  |     , double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,  | 
259  |  |                     "rej good doc wd if more than this fraction rejected", this->params())  | 
260  | 4  |     , BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, "Reject all bad quality wds", this->params())  | 
261  | 4  |     , BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats", this->params())  | 
262  | 4  |     , BOOL_MEMBER(tessedit_debug_quality_metrics, false, "Output data to debug file",  | 
263  |  |                   this->params())  | 
264  | 4  |     , BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks", this->params())  | 
265  | 4  |     , double_MEMBER(quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit", this->params())  | 
266  | 4  |     , BOOL_MEMBER(unlv_tilde_crunching, false, "Mark v.bad words for tilde crunch", this->params())  | 
267  | 4  |     , BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output", this->params())  | 
268  | 4  |     , BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",  | 
269  |  |                   this->params())  | 
270  | 4  |     , BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", this->params())  | 
271  | 4  |     , BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?", this->params())  | 
272  | 4  |     , double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this", this->params())  | 
273  | 4  |     , BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params())  | 
274  | 4  |     , double_MEMBER(crunch_poor_garbage_cert, -9.0, "crunch garbage cert lt this", this->params())  | 
275  | 4  |     , double_MEMBER(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this", this->params())  | 
276  | 4  |     , double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this", this->params())  | 
277  | 4  |     , double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this", this->params())  | 
278  | 4  |     , double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this", this->params())  | 
279  | 4  |     , double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this", this->params())  | 
280  | 4  |     , double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this", this->params())  | 
281  | 4  |     , double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this", this->params())  | 
282  | 4  |     , double_MEMBER(crunch_del_min_width, 3.0, "Del if word width lt xht x this", this->params())  | 
283  | 4  |     , double_MEMBER(crunch_del_high_word, 1.5, "Del if word gt xht x this above bl", this->params())  | 
284  | 4  |     , double_MEMBER(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl", this->params())  | 
285  | 4  |     , double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this", this->params())  | 
286  | 4  |     , INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch", this->params())  | 
287  | 4  |     , INT_MEMBER(crunch_pot_indicators, 1, "How many potential indicators needed", this->params())  | 
288  | 4  |     , BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings", this->params())  | 
289  | 4  |     , BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring", this->params())  | 
290  | 4  |     , BOOL_MEMBER(crunch_leave_accept_strings, false, "Don't pot crunch sensible strings",  | 
291  |  |                   this->params())  | 
292  | 4  |     , BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures", this->params())  | 
293  | 4  |     , INT_MEMBER(crunch_leave_lc_strings, 4, "Don't crunch words with long lower case strings",  | 
294  |  |                  this->params())  | 
295  | 4  |     , INT_MEMBER(crunch_leave_uc_strings, 4, "Don't crunch words with long lower case strings",  | 
296  |  |                  this->params())  | 
297  | 4  |     , INT_MEMBER(crunch_long_repetitions, 3, "Crunch words with long repetitions", this->params())  | 
298  | 4  |     , INT_MEMBER(crunch_debug, 0, "As it says", this->params())  | 
299  | 4  |     , INT_MEMBER(fixsp_non_noise_limit, 1, "How many non-noise blbs either side?", this->params())  | 
300  | 4  |     , double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this", this->params())  | 
301  | 4  |     , BOOL_MEMBER(tessedit_prefer_joined_punct, false, "Reward punctuation joins", this->params())  | 
302  | 4  |     , INT_MEMBER(fixsp_done_mode, 1, "What constitutes done for spacing", this->params())  | 
303  | 4  |     , INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug", this->params())  | 
304  | 4  |     , STRING_MEMBER(numeric_punctuation, ".,", "Punct. chs expected WITHIN numbers", this->params())  | 
305  | 4  |     , INT_MEMBER(x_ht_acceptance_tolerance, 8,  | 
306  |  |                  "Max allowed deviation of blob top outside of font data", this->params())  | 
307  | 4  |     , INT_MEMBER(x_ht_min_change, 8, "Min change in xht before actually trying it", this->params())  | 
308  | 4  |     , INT_MEMBER(superscript_debug, 0, "Debug level for sub & superscript fixer", this->params())  | 
309  | 4  |     , double_MEMBER(superscript_worse_certainty, 2.0,  | 
310  |  |                     "How many times worse "  | 
311  |  |                     "certainty does a superscript position glyph need to be for "  | 
312  |  |                     "us to try classifying it as a char with a different "  | 
313  |  |                     "baseline?",  | 
314  |  |                     this->params())  | 
315  | 4  |     , double_MEMBER(superscript_bettered_certainty, 0.97,  | 
316  |  |                     "What reduction in "  | 
317  |  |                     "badness do we think sufficient to choose a superscript "  | 
318  |  |                     "over what we'd thought.  For example, a value of 0.6 means "  | 
319  |  |                     "we want to reduce badness of certainty by at least 40%",  | 
320  |  |                     this->params())  | 
321  | 4  |     , double_MEMBER(superscript_scaledown_ratio, 0.4,  | 
322  |  |                     "A superscript scaled down more than this is unbelievably "  | 
323  |  |                     "small.  For example, 0.3 means we expect the font size to "  | 
324  |  |                     "be no smaller than 30% of the text line font size.",  | 
325  |  |                     this->params())  | 
326  | 4  |     , double_MEMBER(subscript_max_y_top, 0.5,  | 
327  |  |                     "Maximum top of a character measured as a multiple of "  | 
328  |  |                     "x-height above the baseline for us to reconsider whether "  | 
329  |  |                     "it's a subscript.",  | 
330  |  |                     this->params())  | 
331  | 4  |     , double_MEMBER(superscript_min_y_bottom, 0.3,  | 
332  |  |                     "Minimum bottom of a character measured as a multiple of "  | 
333  |  |                     "x-height above the baseline for us to reconsider whether "  | 
334  |  |                     "it's a superscript.",  | 
335  |  |                     this->params())  | 
336  | 4  |     , BOOL_MEMBER(tessedit_write_block_separators, false, "Write block separators in output",  | 
337  |  |                   this->params())  | 
338  | 4  |     , BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code", this->params())  | 
339  | 4  |     , BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file", this->params())  | 
340  | 4  |     , BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file", this->params())  | 
341  | 4  |     , BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params())  | 
342  | 4  |     , BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file", this->params())  | 
343  | 4  |     , BOOL_MEMBER(tessedit_create_page_xml, false, "Write .page.xml PAGE file", this->params())  | 
344  | 4  |     , BOOL_MEMBER(page_xml_polygon, true, "Create the PAGE file with polygons instead of box values", this->params())  | 
345  | 4  |     , INT_MEMBER(page_xml_level, 0, "Create the PAGE file on 0=line or 1=word level.", this->params())  | 
346  | 4  |     , BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",  | 
347  |  |                   this->params())  | 
348  | 4  |     , BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params())  | 
349  | 4  |     , BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",  | 
350  |  |                   this->params())  | 
351  | 4  |     , BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", this->params())  | 
352  | 4  |     , BOOL_MEMBER(textonly_pdf, false, "Create PDF with only one invisible text layer",  | 
353  |  |                   this->params())  | 
354  | 4  |     , INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params())  | 
355  | 4  |     , INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image", this->params())  | 
356  | 4  |     , INT_MEMBER(min_characters_to_try, 50, "Specify minimum characters to try during OSD",  | 
357  |  |                  this->params())  | 
358  | 4  |     , STRING_MEMBER(unrecognised_char, "|", "Output char for unidentified blobs", this->params())  | 
359  | 4  |     , INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params())  | 
360  | 4  |     , INT_MEMBER(suspect_short_words, 2, "Don't suspect dict wds longer than this", this->params())  | 
361  | 4  |     , BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected", this->params())  | 
362  | 4  |     , double_MEMBER(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit", this->params())  | 
363  | 4  |     , double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit", this->params())  | 
364  | 4  |     , BOOL_MEMBER(tessedit_minimal_rejection, false, "Only reject tess failures", this->params())  | 
365  | 4  |     , BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING", this->params())  | 
366  | 4  |     , BOOL_MEMBER(tessedit_word_for_word, false, "Make output have exactly one word per WERD",  | 
367  |  |                   this->params())  | 
368  | 4  |     , BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, "Don't reject ANYTHING AT ALL",  | 
369  |  |                   this->params())  | 
370  | 4  |     , INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params())  | 
371  | 4  |     , BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug", this->params())  | 
372  | 4  |     , BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips", this->params())  | 
373  | 4  |     , double_MEMBER(tessedit_lower_flip_hyphen, 1.5, "Aspect ratio dot/hyphen test", this->params())  | 
374  | 4  |     , double_MEMBER(tessedit_upper_flip_hyphen, 1.8, "Aspect ratio dot/hyphen test", this->params())  | 
375  | 4  |     , BOOL_MEMBER(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector", this->params())  | 
376  | 4  |     , BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test", this->params())  | 
377  | 4  |     , BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check", this->params())  | 
378  | 4  |     , BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control", this->params())  | 
379  | 4  |     , BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control", this->params())  | 
380  | 4  |     , BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control", this->params())  | 
381  | 4  |     , BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check", this->params())  | 
382  | 4  |     , BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check", this->params())  | 
383  | 4  |     , double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract", this->params())  | 
384  | 4  |     , INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit", this->params())  | 
385  | 4  |     , STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075", "Allow NN to unrej", this->params())  | 
386  | 4  |     , STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set", this->params())  | 
387  | 4  |     , INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this", this->params())  | 
388  | 4  |     , BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes", this->params())  | 
389  | 4  |     , INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages, else specific page to process",  | 
390  |  |                  this->params())  | 
391  | 4  |     , BOOL_MEMBER(tessedit_write_images, false, "Capture the image from the IPE", this->params())  | 
392  | 4  |     , BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", this->params())  | 
393  | 4  |     , STRING_MEMBER(file_type, ".tif", "Filename extension", this->params())  | 
394  | 4  |     , BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word", this->params())  | 
395  | 4  |     , STRING_MEMBER(tessedit_load_sublangs, "", "List of languages to load with this one",  | 
396  |  |                     this->params())  | 
397  | 4  |     , BOOL_MEMBER(tessedit_use_primary_params_model, false,  | 
398  |  |                   "In multilingual mode use params model of the"  | 
399  |  |                   " primary language",  | 
400  |  |                   this->params())  | 
401  | 4  |     , double_MEMBER(min_orientation_margin, 7.0, "Min acceptable orientation margin",  | 
402  |  |                     this->params())  | 
403  | 4  |     , BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", this->params())  | 
404  | 4  |     , BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model", this->params())  | 
405  | 4  |     , BOOL_MEMBER(poly_allow_detailed_fx, false,  | 
406  |  |                   "Allow feature extractors to see the original outline", this->params())  | 
407  | 4  |     , BOOL_INIT_MEMBER(tessedit_init_config_only, false,  | 
408  |  |                        "Only initialize with the config file. Useful if the "  | 
409  |  |                        "instance is not going to be used for OCR but say only "  | 
410  |  |                        "for layout analysis.",  | 
411  |  |                        this->params())  | 
412  |  | #ifndef DISABLED_LEGACY_ENGINE  | 
413  | 4  |     , BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", this->params())  | 
414  |  | #endif // ndef DISABLED_LEGACY_ENGINE  | 
415  | 4  |     , BOOL_MEMBER(textord_tabfind_vertical_text, true, "Enable vertical detection", this->params())  | 
416  | 4  |     , BOOL_MEMBER(textord_tabfind_force_vertical_text, false, "Force using vertical text page mode",  | 
417  |  |                   this->params())  | 
418  | 4  |     , double_MEMBER(textord_tabfind_vertical_text_ratio, 0.5,  | 
419  |  |                     "Fraction of textlines deemed vertical to use vertical page "  | 
420  |  |                     "mode",  | 
421  |  |                     this->params())  | 
422  | 4  |     , double_MEMBER(textord_tabfind_aligned_gap_fraction, 0.75,  | 
423  |  |                     "Fraction of height used as a minimum gap for aligned blobs.", this->params())  | 
424  | 4  |     , INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", this->params())  | 
425  | 4  |     , BOOL_MEMBER(preserve_interword_spaces, false, "Preserve multiple interword spaces",  | 
426  |  |                   this->params())  | 
427  | 4  |     , STRING_MEMBER(page_separator, "\f", "Page separator (default is form feed control character)",  | 
428  |  |                     this->params())  | 
429  | 4  |     , INT_MEMBER(lstm_choice_mode, 0,  | 
430  |  |                  "Allows to include alternative symbols choices in the hOCR output. "  | 
431  |  |                  "Valid input values are 0, 1 and 2. 0 is the default value. "  | 
432  |  |                  "With 1 the alternative symbol choices per timestep are included. "  | 
433  |  |                  "With 2 alternative symbol choices are extracted from the CTC "  | 
434  |  |                  "process instead of the lattice. The choices are mapped per "  | 
435  |  |                  "character.",  | 
436  |  |                  this->params())  | 
437  | 4  |     , INT_MEMBER(lstm_choice_iterations, 5,  | 
438  |  |                  "Sets the number of cascading iterations for the Beamsearch in "  | 
439  |  |                  "lstm_choice_mode. Note that lstm_choice_mode must be set to a "  | 
440  |  |                  "value greater than 0 to produce results.",  | 
441  |  |                  this->params())  | 
442  | 4  |     , double_MEMBER(lstm_rating_coefficient, 5,  | 
443  |  |                     "Sets the rating coefficient for the lstm choices. The smaller the "  | 
444  |  |                     "coefficient, the better are the ratings for each choice and less "  | 
445  |  |                     "information is lost due to the cut off at 0. The standard value is "  | 
446  |  |                     "5",  | 
447  |  |                     this->params())  | 
448  | 4  |     , BOOL_MEMBER(pageseg_apply_music_mask, false,  | 
449  |  |                   "Detect music staff and remove intersecting components", this->params())  | 
450  |  |     ,  | 
451  |  |  | 
452  | 4  |     backup_config_file_(nullptr)  | 
453  | 4  |     , pix_binary_(nullptr)  | 
454  | 4  |     , pix_grey_(nullptr)  | 
455  | 4  |     , pix_original_(nullptr)  | 
456  | 4  |     , pix_thresholds_(nullptr)  | 
457  | 4  |     , source_resolution_(0)  | 
458  | 4  |     , textord_(this)  | 
459  | 4  |     , right_to_left_(false)  | 
460  | 4  |     , scaled_color_(nullptr)  | 
461  | 4  |     , scaled_factor_(-1)  | 
462  | 4  |     , deskew_(1.0f, 0.0f)  | 
463  | 4  |     , reskew_(1.0f, 0.0f)  | 
464  | 4  |     , gradient_(0.0f)  | 
465  | 4  |     , most_recently_used_(this)  | 
466  | 4  |     , font_table_size_(0)  | 
467  |  | #ifndef DISABLED_LEGACY_ENGINE  | 
468  | 4  |     , equ_detect_(nullptr)  | 
469  |  | #endif // ndef DISABLED_LEGACY_ENGINE  | 
470  | 4  |     , lstm_recognizer_(nullptr)  | 
471  | 4  |     , train_line_page_num_(0) {} | 
472  |  |  | 
473  | 0  | Tesseract::~Tesseract() { | 
474  | 0  |   Clear();  | 
475  | 0  |   pix_original_.destroy();  | 
476  | 0  |   end_tesseract();  | 
477  | 0  |   for (auto *lang : sub_langs_) { | 
478  | 0  |     delete lang;  | 
479  | 0  |   }  | 
480  | 0  |   delete lstm_recognizer_;  | 
481  | 0  |   lstm_recognizer_ = nullptr;  | 
482  | 0  | }  | 
483  |  |  | 
484  | 12.3M  | Dict &Tesseract::getDict() { | 
485  | 12.3M  |   if (0 == Classify::getDict().NumDawgs() && AnyLSTMLang()) { | 
486  | 12.3M  |     if (lstm_recognizer_ && lstm_recognizer_->GetDict()) { | 
487  | 12.3M  |       return *lstm_recognizer_->GetDict();  | 
488  | 12.3M  |     }  | 
489  | 12.3M  |   }  | 
490  | 0  |   return Classify::getDict();  | 
491  | 12.3M  | }  | 
492  |  |  | 
493  | 15.4k  | void Tesseract::Clear() { | 
494  | 15.4k  |   std::string debug_name = imagebasename + "_debug.pdf";  | 
495  | 15.4k  |   pixa_debug_.WritePDF(debug_name.c_str());  | 
496  | 15.4k  |   pix_binary_.destroy();  | 
497  | 15.4k  |   pix_grey_.destroy();  | 
498  | 15.4k  |   pix_thresholds_.destroy();  | 
499  | 15.4k  |   scaled_color_.destroy();  | 
500  | 15.4k  |   deskew_ = FCOORD(1.0f, 0.0f);  | 
501  | 15.4k  |   reskew_ = FCOORD(1.0f, 0.0f);  | 
502  | 15.4k  |   gradient_ = 0.0f;  | 
503  | 15.4k  |   splitter_.Clear();  | 
504  | 15.4k  |   scaled_factor_ = -1;  | 
505  | 15.4k  |   for (auto &sub_lang : sub_langs_) { | 
506  | 0  |     sub_lang->Clear();  | 
507  | 0  |   }  | 
508  | 15.4k  | }  | 
509  |  |  | 
510  |  | #ifndef DISABLED_LEGACY_ENGINE  | 
511  |  |  | 
512  | 0  | void Tesseract::SetEquationDetect(EquationDetect *detector) { | 
513  | 0  |   equ_detect_ = detector;  | 
514  | 0  |   equ_detect_->SetLangTesseract(this);  | 
515  | 0  | }  | 
516  |  |  | 
517  |  | // Clear all memory of adaption for this and all subclassifiers.  | 
518  | 0  | void Tesseract::ResetAdaptiveClassifier() { | 
519  | 0  |   ResetAdaptiveClassifierInternal();  | 
520  | 0  |   for (auto &sub_lang : sub_langs_) { | 
521  | 0  |     sub_lang->ResetAdaptiveClassifierInternal();  | 
522  | 0  |   }  | 
523  | 0  | }  | 
524  |  |  | 
525  |  | #endif // ndef DISABLED_LEGACY_ENGINE  | 
526  |  |  | 
527  |  | // Clear the document dictionary for this and all subclassifiers.  | 
528  | 0  | void Tesseract::ResetDocumentDictionary() { | 
529  | 0  |   getDict().ResetDocumentDictionary();  | 
530  | 0  |   for (auto &sub_lang : sub_langs_) { | 
531  | 0  |     sub_lang->getDict().ResetDocumentDictionary();  | 
532  | 0  |   }  | 
533  | 0  | }  | 
534  |  |  | 
535  | 13.9k  | void Tesseract::SetBlackAndWhitelist() { | 
536  |  |   // Set the white and blacklists (if any)  | 
537  | 13.9k  |   unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),  | 
538  | 13.9k  |                                      tessedit_char_whitelist.c_str(),  | 
539  | 13.9k  |                                      tessedit_char_unblacklist.c_str());  | 
540  | 13.9k  |   if (lstm_recognizer_) { | 
541  | 13.9k  |     UNICHARSET &lstm_unicharset = lstm_recognizer_->GetUnicharset();  | 
542  | 13.9k  |     lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),  | 
543  | 13.9k  |                                             tessedit_char_whitelist.c_str(),  | 
544  | 13.9k  |                                             tessedit_char_unblacklist.c_str());  | 
545  | 13.9k  |   }  | 
546  |  |   // Black and white lists should apply to all loaded classifiers.  | 
547  | 13.9k  |   for (auto &sub_lang : sub_langs_) { | 
548  | 0  |     sub_lang->unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),  | 
549  | 0  |                                                  tessedit_char_whitelist.c_str(),  | 
550  | 0  |                                                  tessedit_char_unblacklist.c_str());  | 
551  | 0  |     if (sub_lang->lstm_recognizer_) { | 
552  | 0  |       UNICHARSET &lstm_unicharset = sub_lang->lstm_recognizer_->GetUnicharset();  | 
553  | 0  |       lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),  | 
554  | 0  |                                               tessedit_char_whitelist.c_str(),  | 
555  | 0  |                                               tessedit_char_unblacklist.c_str());  | 
556  | 0  |     }  | 
557  | 0  |   }  | 
558  | 13.9k  | }  | 
559  |  |  | 
560  |  | // Perform steps to prepare underlying binary image/other data structures for  | 
561  |  | // page segmentation.  | 
562  | 15.4k  | void Tesseract::PrepareForPageseg() { | 
563  | 15.4k  |   textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model);  | 
564  |  |   // Find the max splitter strategy over all langs.  | 
565  | 15.4k  |   auto max_pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(  | 
566  | 15.4k  |       static_cast<int32_t>(pageseg_devanagari_split_strategy));  | 
567  | 15.4k  |   for (auto &sub_lang : sub_langs_) { | 
568  | 0  |     auto pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(  | 
569  | 0  |         static_cast<int32_t>(sub_lang->pageseg_devanagari_split_strategy));  | 
570  | 0  |     if (pageseg_strategy > max_pageseg_strategy) { | 
571  | 0  |       max_pageseg_strategy = pageseg_strategy;  | 
572  | 0  |     }  | 
573  | 0  |     sub_lang->pix_binary_.destroy();  | 
574  | 0  |     sub_lang->pix_binary_ = pix_binary().clone();  | 
575  | 0  |   }  | 
576  |  |   // Perform shiro-rekha (top-line) splitting and replace the current image by  | 
577  |  |   // the newly split image.  | 
578  | 15.4k  |   splitter_.set_orig_pix(pix_binary());  | 
579  | 15.4k  |   splitter_.set_pageseg_split_strategy(max_pageseg_strategy);  | 
580  | 15.4k  |   if (splitter_.Split(true, &pixa_debug_)) { | 
581  | 0  |     ASSERT_HOST(splitter_.splitted_image());  | 
582  | 0  |     pix_binary_.destroy();  | 
583  | 0  |     pix_binary_ = splitter_.splitted_image().clone();  | 
584  | 0  |   }  | 
585  | 15.4k  | }  | 
586  |  |  | 
587  |  | // Perform steps to prepare underlying binary image/other data structures for  | 
588  |  | // OCR. The current segmentation is required by this method.  | 
589  |  | // Note that this method resets pix_binary_ to the original binarized image,  | 
590  |  | // which may be different from the image actually used for OCR depending on the  | 
591  |  | // value of devanagari_ocr_split_strategy.  | 
592  | 15.4k  | void Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr) { | 
593  |  |   // Find the max splitter strategy over all langs.  | 
594  | 15.4k  |   auto max_ocr_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(  | 
595  | 15.4k  |       static_cast<int32_t>(ocr_devanagari_split_strategy));  | 
596  | 15.4k  |   for (auto &sub_lang : sub_langs_) { | 
597  | 0  |     auto ocr_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(  | 
598  | 0  |         static_cast<int32_t>(sub_lang->ocr_devanagari_split_strategy));  | 
599  | 0  |     if (ocr_strategy > max_ocr_strategy) { | 
600  | 0  |       max_ocr_strategy = ocr_strategy;  | 
601  | 0  |     }  | 
602  | 0  |   }  | 
603  |  |   // Utilize the segmentation information available.  | 
604  | 15.4k  |   splitter_.set_segmentation_block_list(block_list);  | 
605  | 15.4k  |   splitter_.set_ocr_split_strategy(max_ocr_strategy);  | 
606  |  |   // Run the splitter for OCR  | 
607  | 15.4k  |   bool split_for_ocr = splitter_.Split(false, &pixa_debug_);  | 
608  |  |   // Restore pix_binary to the binarized original pix for future reference.  | 
609  | 15.4k  |   ASSERT_HOST(splitter_.orig_pix());  | 
610  | 15.4k  |   pix_binary_.destroy();  | 
611  | 15.4k  |   pix_binary_ = splitter_.orig_pix().clone();  | 
612  |  |   // If the pageseg and ocr strategies are different, refresh the block list  | 
613  |  |   // (from the last SegmentImage call) with blobs from the real image to be used  | 
614  |  |   // for OCR.  | 
615  | 15.4k  |   if (splitter_.HasDifferentSplitStrategies()) { | 
616  | 0  |     BLOCK block("", true, 0, 0, 0, 0, pixGetWidth(pix_binary_), pixGetHeight(pix_binary_)); | 
617  | 0  |     Image pix_for_ocr = split_for_ocr ? splitter_.splitted_image() : splitter_.orig_pix();  | 
618  | 0  |     extract_edges(pix_for_ocr, &block);  | 
619  | 0  |     splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());  | 
620  | 0  |   }  | 
621  |  |   // The splitter isn't needed any more after this, so save memory by clearing.  | 
622  | 15.4k  |   splitter_.Clear();  | 
623  | 15.4k  | }  | 
624  |  |  | 
625  |  | } // namespace tesseract  |