/src/tesseract/src/wordrec/wordrec.h
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: wordrec.h |
3 | | // Description: wordrec class. |
4 | | // Author: Samuel Charron |
5 | | // |
6 | | // (C) Copyright 2006, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | // |
17 | | /////////////////////////////////////////////////////////////////////// |
18 | | |
19 | | #ifndef TESSERACT_WORDREC_WORDREC_H_ |
20 | | #define TESSERACT_WORDREC_WORDREC_H_ |
21 | | |
22 | | #ifdef HAVE_CONFIG_H |
23 | | # include "config_auto.h" // DISABLED_LEGACY_ENGINE |
24 | | #endif |
25 | | |
26 | | #ifdef DISABLED_LEGACY_ENGINE |
27 | | |
28 | | # include <cstdint> // for int16_t, int32_t |
29 | | # include "classify.h" // for Classify |
30 | | # include "params.h" // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP... |
31 | | # include "ratngs.h" // for WERD_CHOICE |
32 | | |
33 | | namespace tesseract { |
34 | | class TessdataManager; |
35 | | } |
36 | | |
37 | | namespace tesseract { |
38 | | |
39 | | /* ccmain/tstruct.cpp */ |
40 | | |
41 | | class TESS_API Wordrec : public Classify { |
42 | | public: |
43 | | // config parameters |
44 | | |
45 | | BOOL_VAR_H(wordrec_debug_blamer); |
46 | | BOOL_VAR_H(wordrec_run_blamer); |
47 | | |
48 | | // methods |
49 | | Wordrec(); |
50 | | virtual ~Wordrec() = default; |
51 | | |
52 | | // tface.cpp |
53 | | void program_editup(const std::string &textbase, TessdataManager *init_classifier, |
54 | | TessdataManager *init_dict); |
55 | | void program_editdown(int32_t elapsed_time); |
56 | | int end_recog(); |
57 | | int dict_word(const WERD_CHOICE &word); |
58 | | |
59 | | // Member variables |
60 | | WERD_CHOICE *prev_word_best_choice_; |
61 | | }; |
62 | | |
63 | | } // namespace tesseract |
64 | | |
65 | | #else // DISABLED_LEGACY_ENGINE not defined |
66 | | |
67 | | # include <memory> |
68 | | # include "associate.h" |
69 | | # include "chop.h" // for PointHeap, MAX_NUM_POINTS |
70 | | # include "classify.h" // for Classify |
71 | | # include "dict.h" |
72 | | # include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK |
73 | | # include "findseam.h" // for SeamQueue, SeamPile |
74 | | # include "language_model.h" |
75 | | # include "matrix.h" |
76 | | # include "oldlist.h" // for LIST |
77 | | # include "params.h" // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP... |
78 | | # include "points.h" // for ICOORD |
79 | | # include "ratngs.h" // for BLOB_CHOICE_LIST (ptr only), BLOB_CHOI... |
80 | | # include "seam.h" // for SEAM (ptr only), PRIORITY |
81 | | # include "stopper.h" // for DANGERR |
82 | | |
83 | | # include <cstdint> // for int16_t, int32_t |
84 | | |
85 | | namespace tesseract { |
86 | | |
87 | | class EDGEPT_CLIST; |
88 | | class MATRIX; |
89 | | class TBOX; |
90 | | class UNICHARSET; |
91 | | class WERD_RES; |
92 | | |
93 | | class LMPainPoints; |
94 | | class TessdataManager; |
95 | | struct BestChoiceBundle; |
96 | | |
97 | | struct BlamerBundle; |
98 | | struct EDGEPT; |
99 | | struct MATRIX_COORD; |
100 | | struct SPLIT; |
101 | | struct TBLOB; |
102 | | struct TESSLINE; |
103 | | struct TWERD; |
104 | | |
105 | | // A class for storing which nodes are to be processed by the segmentation |
106 | | // search. There is a single SegSearchPending for each column in the ratings |
107 | | // matrix, and it indicates whether the segsearch should combine all |
108 | | // BLOB_CHOICES in the column, or just the given row with the parents |
109 | | // corresponding to *this SegSearchPending, and whether only updated parent |
110 | | // ViterbiStateEntries should be combined, or all, with the BLOB_CHOICEs. |
111 | | class SegSearchPending { |
112 | | public: |
113 | | SegSearchPending() |
114 | 333k | : classified_row_(-1), revisit_whole_column_(false), column_classified_(false) {} |
115 | | |
116 | | // Marks the whole column as just classified. Used to start a search on |
117 | | // a newly initialized ratings matrix. |
118 | 144k | void SetColumnClassified() { |
119 | 144k | column_classified_ = true; |
120 | 144k | } |
121 | | // Marks the matrix entry at the given row as just classified. |
122 | | // Used after classifying a new matrix cell. |
123 | | // Additional to, not overriding a previous RevisitWholeColumn. |
124 | 1.36M | void SetBlobClassified(int row) { |
125 | 1.36M | classified_row_ = row; |
126 | 1.36M | } |
127 | | // Marks the whole column as needing work, but not just classified. |
128 | | // Used when the parent vse list is updated. |
129 | | // Additional to, not overriding a previous SetBlobClassified. |
130 | 2.44M | void RevisitWholeColumn() { |
131 | 2.44M | revisit_whole_column_ = true; |
132 | 2.44M | } |
133 | | |
134 | | // Clears *this to indicate no work to do. |
135 | 17.2M | void Clear() { |
136 | 17.2M | classified_row_ = -1; |
137 | 17.2M | revisit_whole_column_ = false; |
138 | 17.2M | column_classified_ = false; |
139 | 17.2M | } |
140 | | |
141 | | // Returns true if there are updates to do in the column that *this |
142 | | // represents. |
143 | 10.7M | bool WorkToDo() const { |
144 | 10.7M | return revisit_whole_column_ || column_classified_ || classified_row_ >= 0; |
145 | 10.7M | } |
146 | | // Returns true if the given row was just classified. |
147 | 4.47M | bool IsRowJustClassified(int row) const { |
148 | 4.47M | return row == classified_row_ || column_classified_; |
149 | 4.47M | } |
150 | | // Returns the single row to process if there is only one, otherwise -1. |
151 | 4.72M | int SingleRow() const { |
152 | 4.72M | return revisit_whole_column_ || column_classified_ ? -1 : classified_row_; |
153 | 4.72M | } |
154 | | |
155 | | private: |
156 | | // If non-negative, indicates the single row in the ratings matrix that has |
157 | | // just been classified, and so should be combined with all the parents in the |
158 | | // column that this SegSearchPending represents. |
159 | | // Operates independently of revisit_whole_column. |
160 | | int classified_row_; |
161 | | // If revisit_whole_column is true, then all BLOB_CHOICEs in this column will |
162 | | // be processed, but classified_row can indicate a row that is newly |
163 | | // classified. Overridden if column_classified is true. |
164 | | bool revisit_whole_column_; |
165 | | // If column_classified is true, parent vses are processed with all rows |
166 | | // regardless of whether they are just updated, overriding |
167 | | // revisit_whole_column and classified_row. |
168 | | bool column_classified_; |
169 | | }; |
170 | | |
171 | | /* ccmain/tstruct.cpp *********************************************************/ |
172 | | class FRAGMENT : public ELIST<FRAGMENT>::LINK { |
173 | | public: |
174 | 0 | FRAGMENT() { // constructor |
175 | 0 | } |
176 | | FRAGMENT(EDGEPT *head_pt, // start |
177 | | EDGEPT *tail_pt); // end |
178 | | |
179 | | ICOORD head; // coords of start |
180 | | ICOORD tail; // coords of end |
181 | | EDGEPT *headpt; // start point |
182 | | EDGEPT *tailpt; // end point |
183 | | }; |
184 | | ELISTIZEH(FRAGMENT) |
185 | | |
186 | | class TESS_API Wordrec : public Classify { |
187 | | public: |
188 | | // config parameters ******************************************************* |
189 | | BOOL_VAR_H(merge_fragments_in_matrix); |
190 | | BOOL_VAR_H(wordrec_enable_assoc); |
191 | | BOOL_VAR_H(force_word_assoc); |
192 | | INT_VAR_H(repair_unchopped_blobs); |
193 | | double_VAR_H(tessedit_certainty_threshold); |
194 | | INT_VAR_H(chop_debug); |
195 | | BOOL_VAR_H(chop_enable); |
196 | | BOOL_VAR_H(chop_vertical_creep); |
197 | | INT_VAR_H(chop_split_length); |
198 | | INT_VAR_H(chop_same_distance); |
199 | | INT_VAR_H(chop_min_outline_points); |
200 | | INT_VAR_H(chop_seam_pile_size); |
201 | | BOOL_VAR_H(chop_new_seam_pile); |
202 | | INT_VAR_H(chop_inside_angle); |
203 | | INT_VAR_H(chop_min_outline_area); |
204 | | double_VAR_H(chop_split_dist_knob); |
205 | | double_VAR_H(chop_overlap_knob); |
206 | | double_VAR_H(chop_center_knob); |
207 | | INT_VAR_H(chop_centered_maxwidth); |
208 | | double_VAR_H(chop_sharpness_knob); |
209 | | double_VAR_H(chop_width_change_knob); |
210 | | double_VAR_H(chop_ok_split); |
211 | | double_VAR_H(chop_good_split); |
212 | | INT_VAR_H(chop_x_y_weight); |
213 | | BOOL_VAR_H(assume_fixed_pitch_char_segment); |
214 | | INT_VAR_H(wordrec_debug_level); |
215 | | INT_VAR_H(wordrec_max_join_chunks); |
216 | | BOOL_VAR_H(wordrec_skip_no_truth_words); |
217 | | BOOL_VAR_H(wordrec_debug_blamer); |
218 | | BOOL_VAR_H(wordrec_run_blamer); |
219 | | INT_VAR_H(segsearch_debug_level); |
220 | | INT_VAR_H(segsearch_max_pain_points); |
221 | | INT_VAR_H(segsearch_max_futile_classifications); |
222 | | double_VAR_H(segsearch_max_char_wh_ratio); |
223 | | BOOL_VAR_H(save_alt_choices); |
224 | | |
225 | | // methods from wordrec/*.cpp *********************************************** |
226 | | Wordrec(); |
227 | 0 | ~Wordrec() override = default; |
228 | | |
229 | | // Fills word->alt_choices with alternative paths found during |
230 | | // chopping/segmentation search that are kept in best_choices. |
231 | | void SaveAltChoices(const LIST &best_choices, WERD_RES *word); |
232 | | |
233 | | // Fills character choice lattice in the given BlamerBundle |
234 | | // using the given ratings matrix and best choice list. |
235 | | void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, |
236 | | const UNICHARSET &unicharset, BlamerBundle *blamer_bundle); |
237 | | |
238 | | // Calls fill_lattice_ member function |
239 | | // (assumes that fill_lattice_ is not nullptr). |
240 | | void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, |
241 | 0 | const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) { |
242 | 0 | (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle); |
243 | 0 | } |
244 | | |
245 | | // tface.cpp |
246 | | void program_editup(const std::string &textbase, TessdataManager *init_classifier, |
247 | | TessdataManager *init_dict); |
248 | | void cc_recog(WERD_RES *word); |
249 | | void program_editdown(int32_t elapsed_time); |
250 | | void set_pass1(); |
251 | | void set_pass2(); |
252 | | int end_recog(); |
253 | | BLOB_CHOICE_LIST *call_matcher(TBLOB *blob); |
254 | | int dict_word(const WERD_CHOICE &word); |
255 | | // wordclass.cpp |
256 | | BLOB_CHOICE_LIST *classify_blob(TBLOB *blob, const char *string, ScrollView::Color color, |
257 | | BlamerBundle *blamer_bundle); |
258 | | |
259 | | // segsearch.cpp |
260 | | // SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs. |
261 | | // Each entry in the matrix represents the classification choice |
262 | | // for a chunk, i.e. an entry in row 2, column 1 represents the list |
263 | | // of ratings for the chunks 1 and 2 classified as a single blob. |
264 | | // The entries on the diagonal of the matrix are classifier choice lists |
265 | | // for a single chunk from the maximal segmentation. |
266 | | // |
267 | | // The ratings matrix given to SegSearch represents the segmentation |
268 | | // graph / trellis for the current word. The nodes in the graph are the |
269 | | // individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings |
270 | | // matrix. The children of each node (nodes connected by outgoing links) |
271 | | // are the entries in the column that is equal to node's row+1. The parents |
272 | | // (nodes connected by the incoming links) are the entries in the row that |
273 | | // is equal to the node's column-1. Here is an example ratings matrix: |
274 | | // |
275 | | // 0 1 2 3 4 |
276 | | // ------------------------- |
277 | | // 0| c,( | |
278 | | // 1| d l,1 | |
279 | | // 2| o | |
280 | | // 3| c,( | |
281 | | // 4| g,y l,1 | |
282 | | // ------------------------- |
283 | | // |
284 | | // In the example above node "o" has children (outgoing connection to nodes) |
285 | | // "c","(","g","y" and parents (incoming connections from nodes) "l","1","d". |
286 | | // |
287 | | // The objective of the search is to find the least cost path, where the cost |
288 | | // is determined by the language model components and the properties of the |
289 | | // cut between the blobs on the path. SegSearch starts by populating the |
290 | | // matrix with the all the entries that were classified by the chopper and |
291 | | // finding the initial best path. Based on the classifier ratings, language |
292 | | // model scores and the properties of each cut, a list of "pain points" is |
293 | | // constructed - those are the points on the path where the choices do not |
294 | | // look consistent with the neighboring choices, the cuts look particularly |
295 | | // problematic, or the certainties of the blobs are low. The most troublesome |
296 | | // "pain point" is picked from the list and the new entry in the ratings |
297 | | // matrix corresponding to this "pain point" is filled in. Then the language |
298 | | // model state is updated to reflect the new classification and the new |
299 | | // "pain points" are added to the list and the next most troublesome |
300 | | // "pain point" is determined. This continues until either the word choice |
301 | | // composed from the best paths in the segmentation graph is "good enough" |
302 | | // (e.g. above a certain certainty threshold, is an unambiguous dictionary |
303 | | // word, etc) or there are no more "pain points" to explore. |
304 | | // |
305 | | // If associate_blobs is set to false no new classifications will be done |
306 | | // to combine blobs. Segmentation search will run only one "iteration" |
307 | | // on the classifications already recorded in chunks_record.ratings. |
308 | | // |
309 | | // Note: this function assumes that word_res, best_choice_bundle arguments |
310 | | // are not nullptr. |
311 | | void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, |
312 | | BlamerBundle *blamer_bundle); |
313 | | |
314 | | // Setup and run just the initial segsearch on an established matrix, |
315 | | // without doing any additional chopping or joining. |
316 | | // (Internal factored version that can be used as part of the main SegSearch.) |
317 | | void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, |
318 | | std::vector<SegSearchPending> *pending, |
319 | | BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle); |
320 | | |
321 | | // chop.cpp |
322 | | PRIORITY point_priority(EDGEPT *point); |
323 | | void add_point_to_list(PointHeap *point_heap, EDGEPT *point); |
324 | | // Returns true if the edgept supplied as input is an inside angle. This |
325 | | // is determined by the angular change of the vectors from point to point. |
326 | | bool is_inside_angle(EDGEPT *pt); |
327 | | int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3); |
328 | | EDGEPT *pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist); |
329 | | void prioritize_points(TESSLINE *outline, PointHeap *points); |
330 | | void new_min_point(EDGEPT *local_min, PointHeap *points); |
331 | | void new_max_point(EDGEPT *local_max, PointHeap *points); |
332 | | void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, |
333 | | EDGEPT_CLIST *new_points); |
334 | | |
335 | | // chopper.cpp |
336 | | SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, |
337 | | const std::vector<SEAM *> &seams); |
338 | | SEAM *chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob, |
339 | | const std::vector<SEAM *> &seams); |
340 | | SEAM *chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob, WERD_RES *word_res, |
341 | | unsigned *blob_number); |
342 | | SEAM *improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, |
343 | | bool split_next_to_fragment, bool italic_blob, WERD_RES *word, |
344 | | unsigned *blob_number); |
345 | | SEAM *chop_one_blob(const std::vector<TBOX> &boxes, |
346 | | const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, |
347 | | unsigned *blob_number); |
348 | | void chop_word_main(WERD_RES *word); |
349 | | void improve_by_chopping(float rating_cert_scale, WERD_RES *word, |
350 | | BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, |
351 | | LMPainPoints *pain_points, std::vector<SegSearchPending> *pending); |
352 | | int select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices, float rating_ceiling, |
353 | | bool split_next_to_fragment); |
354 | | int select_blob_to_split_from_fixpt(DANGERR *fixpt); |
355 | | |
356 | | // findseam.cpp |
357 | | void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams); |
358 | | void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, |
359 | | SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile); |
360 | | void combine_seam(const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue); |
361 | | SEAM *pick_good_seam(TBLOB *blob); |
362 | | void try_point_pairs(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, |
363 | | SeamPile *seam_pile, SEAM **seam, TBLOB *blob); |
364 | | void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, |
365 | | EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, |
366 | | SEAM **seam, TBLOB *blob); |
367 | | |
368 | | // gradechop.cpp |
369 | | PRIORITY grade_split_length(SPLIT *split); |
370 | | PRIORITY grade_sharpness(SPLIT *split); |
371 | | |
372 | | // outlines.cpp |
373 | | bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt); |
374 | | |
375 | | // pieces.cpp |
376 | | virtual BLOB_CHOICE_LIST *classify_piece(const std::vector<SEAM *> &seams, int16_t start, |
377 | | int16_t end, const char *description, TWERD *word, |
378 | | BlamerBundle *blamer_bundle); |
379 | | |
380 | | // Member variables. |
381 | | |
382 | | std::unique_ptr<LanguageModel> language_model_; |
383 | | PRIORITY pass2_ok_split; |
384 | | // Stores the best choice for the previous word in the paragraph. |
385 | | // This variable is modified by PAGE_RES_IT when iterating over |
386 | | // words to OCR on the page. |
387 | | WERD_CHOICE *prev_word_best_choice_; |
388 | | |
389 | | // Function used to fill char choice lattices. |
390 | | void (Wordrec::*fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, |
391 | | const UNICHARSET &unicharset, BlamerBundle *blamer_bundle); |
392 | | |
393 | | protected: |
394 | 2.25M | inline bool SegSearchDone(int num_futile_classifications) { |
395 | 2.25M | return (language_model_->AcceptableChoiceFound() || |
396 | 2.25M | num_futile_classifications >= segsearch_max_futile_classifications); |
397 | 2.25M | } |
398 | | |
399 | | // Updates the language model state recorded for the child entries specified |
400 | | // in pending[starting_col]. Enqueues the children of the updated entries |
401 | | // into pending and proceeds to update (and remove from pending) all the |
402 | | // remaining entries in pending[col] (col >= starting_col). Upon termination |
403 | | // of this function all the pending[col] lists will be empty. |
404 | | // |
405 | | // The arguments: |
406 | | // |
407 | | // starting_col: index of the column in chunks_record->ratings from |
408 | | // which the update should be started |
409 | | // |
410 | | // pending: list of entries listing chunks_record->ratings entries |
411 | | // that should be updated |
412 | | // |
413 | | // pain_points: priority heap listing the pain points generated by |
414 | | // the language model |
415 | | // |
416 | | // temp_pain_points: temporary storage for tentative pain points generated |
417 | | // by the language model after a single call to LanguageModel::UpdateState() |
418 | | // (the argument is passed in rather than created before each |
419 | | // LanguageModel::UpdateState() call to avoid dynamic memory re-allocation) |
420 | | // |
421 | | // best_choice_bundle: a collection of variables that should be updated |
422 | | // if a new best choice is found |
423 | | // |
424 | | void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, |
425 | | std::vector<SegSearchPending> *pending, WERD_RES *word_res, |
426 | | LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, |
427 | | BlamerBundle *blamer_bundle); |
428 | | |
429 | | // Process the given pain point: classify the corresponding blob, enqueue |
430 | | // new pain points to join the newly classified blob with its neighbors. |
431 | | void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, |
432 | | const char *pain_point_type, |
433 | | std::vector<SegSearchPending> *pending, WERD_RES *word_res, |
434 | | LMPainPoints *pain_points, BlamerBundle *blamer_bundle); |
435 | | // Resets enough of the results so that the Viterbi search is re-run. |
436 | | // Needed when the n-gram model is enabled, as the multi-length comparison |
437 | | // implementation will re-value existing paths to worse values. |
438 | | void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, |
439 | | std::vector<SegSearchPending> &pending); |
440 | | |
441 | | // Add pain points for classifying blobs on the correct segmentation path |
442 | | // (so that we can evaluate correct segmentation path and discover the reason |
443 | | // for incorrect result). |
444 | | void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, |
445 | | BlamerBundle *blamer_bundle, std::string &blamer_debug); |
446 | | }; |
447 | | |
448 | | } // namespace tesseract |
449 | | |
450 | | #endif // DISABLED_LEGACY_ENGINE |
451 | | |
452 | | #endif // TESSERACT_WORDREC_WORDREC_H_ |