/src/tesseract/src/wordrec/chopper.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | * |
3 | | * File: chopper.cpp (Formerly chopper.c) |
4 | | * Author: Mark Seaman, OCR Technology |
5 | | * |
6 | | * (c) Copyright 1987, Hewlett-Packard Company. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | *****************************************************************************/ |
18 | | |
19 | | // Include automatically generated configuration file if running autoconf. |
20 | | #ifdef HAVE_CONFIG_H |
21 | | # include "config_auto.h" |
22 | | #endif |
23 | | |
24 | | #include "blamer.h" // for BlamerBundle, IRR_CORRECT |
25 | | #include "blobs.h" // for TPOINT, TBLOB, EDGEPT, TESSLINE, divisible_blob |
26 | | #include "dict.h" // for Dict |
27 | | #include "lm_pain_points.h" // for LMPainPoints |
28 | | #include "lm_state.h" // for BestChoiceBundle |
29 | | #include "matrix.h" // for MATRIX |
30 | | #include "normalis.h" // for DENORM |
31 | | #include "pageres.h" // for WERD_RES |
32 | | #include "params.h" // for IntParam, BoolParam |
33 | | #include "ratngs.h" // for BLOB_CHOICE (ptr only), BLOB_CHOICE_LIST (ptr ... |
34 | | #include "rect.h" // for TBOX |
35 | | #include "render.h" // for display_blob |
36 | | #include "seam.h" // for SEAM |
37 | | #include "split.h" // for remove_edgept |
38 | | #include "stopper.h" // for DANGERR |
39 | | #include "tprintf.h" // for tprintf |
40 | | #include "wordrec.h" // for Wordrec, SegSearchPending (ptr only) |
41 | | |
42 | | namespace tesseract { |
43 | | |
44 | | // Even though the limit on the number of chunks may now be removed, keep |
45 | | // the same limit for repeatable behavior, and it may be a speed advantage. |
46 | | static const int kMaxNumChunks = 64; |
47 | | |
48 | | /*---------------------------------------------------------------------- |
49 | | F u n c t i o n s |
50 | | ----------------------------------------------------------------------*/ |
51 | | |
52 | | /** |
53 | | * @name check_blob |
54 | | * |
55 | | * @return true if blob has a non whole outline. |
56 | | */ |
57 | 209k | static int check_blob(TBLOB *blob) { |
58 | 209k | TESSLINE *outline; |
59 | 209k | EDGEPT *edgept; |
60 | | |
61 | 582k | for (outline = blob->outlines; outline != nullptr; outline = outline->next) { |
62 | 372k | edgept = outline->loop; |
63 | 2.40M | do { |
64 | 2.40M | if (edgept == nullptr) { |
65 | 0 | break; |
66 | 0 | } |
67 | 2.40M | edgept = edgept->next; |
68 | 2.40M | } while (edgept != outline->loop); |
69 | 372k | if (edgept == nullptr) { |
70 | 0 | return 1; |
71 | 0 | } |
72 | 372k | } |
73 | 209k | return 0; |
74 | 209k | } |
75 | | |
76 | | /** |
77 | | * @name any_shared_split_points |
78 | | * |
79 | | * Return true if any of the splits share a point with this one. |
80 | | */ |
81 | 201k | static int any_shared_split_points(const std::vector<SEAM *> &seams, SEAM *seam) { |
82 | 201k | int length; |
83 | 201k | int index; |
84 | | |
85 | 201k | length = seams.size(); |
86 | 2.89M | for (index = 0; index < length; index++) { |
87 | 2.69M | if (seam->SharesPosition(*seams[index])) { |
88 | 5.13k | return true; |
89 | 5.13k | } |
90 | 2.69M | } |
91 | 196k | return false; |
92 | 201k | } |
93 | | |
94 | | /** |
95 | | * @name preserve_outline_tree |
96 | | * |
97 | | * Copy the list of outlines. |
98 | | */ |
99 | 2.02M | static void preserve_outline(EDGEPT *start) { |
100 | 2.02M | EDGEPT *srcpt; |
101 | | |
102 | 2.02M | if (start == nullptr) { |
103 | 0 | return; |
104 | 0 | } |
105 | 2.02M | srcpt = start; |
106 | 13.7M | do { |
107 | 13.7M | srcpt->runlength = 1; |
108 | 13.7M | srcpt = srcpt->next; |
109 | 13.7M | } while (srcpt != start); |
110 | 2.02M | srcpt->runlength = 2; |
111 | 2.02M | } |
112 | | |
113 | 1.07M | static void preserve_outline_tree(TESSLINE *srcline) { |
114 | 1.07M | TESSLINE *outline; |
115 | | |
116 | 3.09M | for (outline = srcline; outline != nullptr; outline = outline->next) { |
117 | 2.02M | preserve_outline(outline->loop); |
118 | 2.02M | } |
119 | 1.07M | } |
120 | | |
121 | | /** |
122 | | * @name restore_outline_tree |
123 | | * |
124 | | * Copy the list of outlines. |
125 | | */ |
// Reverts one outline loop to the state recorded by preserve_outline():
// relocates the saved start point (runlength == 2) and removes every edge
// point added since preservation (runlength == 0, i.e. never tagged).
// Returns the restored start point, or nullptr for a null input.
static EDGEPT *restore_outline(EDGEPT *start) {
  EDGEPT *srcpt;
  EDGEPT *real_start;

  if (start == nullptr) {
    return nullptr;
  }
  srcpt = start;
  // Find the point tagged with runlength 2 by preserve_outline(); if none
  // is found the scan wraps and the original start is used as-is.
  do {
    if (srcpt->runlength == 2) {
      break;
    }
    srcpt = srcpt->next;
  } while (srcpt != start);
  real_start = srcpt;
  // Delete every point inserted after preservation (runlength still 0).
  // The prev pointer is checked so the point being removed is never the
  // one used to advance the iteration.
  do {
    srcpt = srcpt->next;
    if (srcpt->prev->runlength == 0) {
      remove_edgept(srcpt->prev);
    }
  } while (srcpt != real_start);
  return real_start;
}
149 | | |
150 | 1.00M | static void restore_outline_tree(TESSLINE *srcline) { |
151 | 1.00M | TESSLINE *outline; |
152 | | |
153 | 2.82M | for (outline = srcline; outline != nullptr; outline = outline->next) { |
154 | 1.82M | outline->loop = restore_outline(outline->loop); |
155 | 1.82M | outline->start = outline->loop->pos; |
156 | 1.82M | } |
157 | 1.00M | } |
158 | | |
159 | | /********************************************************************** |
160 | | * total_containment |
161 | | * |
162 | | * Check to see if one of these outlines is totally contained within |
163 | | * the bounding box of the other. |
164 | | **********************************************************************/ |
165 | 274k | static int16_t total_containment(TBLOB *blob1, TBLOB *blob2) { |
166 | 274k | TBOX box1 = blob1->bounding_box(); |
167 | 274k | TBOX box2 = blob2->bounding_box(); |
168 | 274k | return box1.contains(box2) || box2.contains(box1); |
169 | 274k | } |
170 | | |
171 | | // Helper runs all the checks on a seam to make sure it is valid. |
172 | | // Returns the seam if OK, otherwise deletes the seam and returns nullptr. |
static SEAM *CheckSeam(int debug_level, int32_t blob_number, TWERD *word, TBLOB *blob,
                       TBLOB *other_blob, const std::vector<SEAM *> &seams, SEAM *seam) {
  // Reject the chop when any validity test fails:
  //  - no seam was produced at all;
  //  - either half of the chop ended up with no outlines;
  //  - one half's bounding box fully contains the other (degenerate chop);
  //  - the new half has a broken (non-closed) outline loop;
  //  - the seam is not contained inside both halves;
  //  - the seam shares a split point with an already-accepted seam;
  //  - the seam cannot be inserted into the word's seam bookkeeping.
  if (seam == nullptr || blob->outlines == nullptr || other_blob->outlines == nullptr ||
      total_containment(blob, other_blob) || check_blob(other_blob) ||
      !seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) ||
      any_shared_split_points(seams, seam) ||
      !seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) {
    // Take the speculatively-inserted second blob back out of the word.
    word->blobs.erase(word->blobs.begin() + blob_number + 1);
    if (seam) {
      // The seam had been applied; undo the split before discarding it.
      seam->UndoSeam(blob, other_blob);
      delete seam;
      seam = nullptr;
#ifndef GRAPHICS_DISABLED
      if (debug_level) {
        if (debug_level > 2) {
          display_blob(blob, ScrollView::RED);
        }
        tprintf("\n** seam being removed ** \n");
      }
#endif
    } else {
      // No seam was ever applied, so other_blob is still an untouched
      // shallow copy; dispose of it directly.
      delete other_blob;
    }
    return nullptr;
  }
  return seam;
}
200 | | |
201 | | /** |
202 | | * @name attempt_blob_chop |
203 | | * |
204 | | * Try to split the this blob after this one. Check to make sure that |
205 | | * it was successful. |
206 | | */ |
SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob,
                                 const std::vector<SEAM *> &seams) {
  if (repair_unchopped_blobs) {
    // Snapshot the outlines so a failed chop can be rolled back below.
    preserve_outline_tree(blob->outlines);
  }
  TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
  // Insert it into the word.
  word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob);

  SEAM *seam = nullptr;
  if (prioritize_division) {
    // Prefer a zero-priority division at an outline boundary when enabled.
    TPOINT location;
    if (divisible_blob(blob, italic_blob, &location)) {
      seam = new SEAM(0.0f, location);
    }
  }
  if (seam == nullptr) {
    // Otherwise search the blob's outlines for the best split candidate.
    seam = pick_good_seam(blob);
  }
  if (chop_debug) {
    if (seam != nullptr) {
      seam->Print("Good seam picked=");
    } else {
      tprintf("\n** no seam picked *** \n");
    }
  }
  if (seam) {
    // Physically split the outlines between blob and other_blob.
    seam->ApplySeam(italic_blob, blob, other_blob);
  }

  // Validate the chop; on any failure CheckSeam undoes the split, cleans
  // up other_blob/seam, and returns nullptr.
  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam);
  if (seam == nullptr) {
    if (repair_unchopped_blobs) {
      // Roll the outlines back to the snapshot taken above.
      restore_outline_tree(blob->outlines);
    }
    if (allow_blob_division && !prioritize_division) {
      // If the blob can simply be divided into outlines, then do that.
      TPOINT location;
      if (divisible_blob(blob, italic_blob, &location)) {
        other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
        word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob);
        seam = new SEAM(0.0f, location);
        seam->ApplySeam(italic_blob, blob, other_blob);
        seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam);
      }
    }
  }
  if (seam != nullptr) {
    // Make sure this seam doesn't get chopped again.
    seam->Finalize();
  }
  return seam;
}
260 | | |
261 | | SEAM *Wordrec::chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob, |
262 | 1.07M | const std::vector<SEAM *> &seams) { |
263 | 1.07M | return attempt_blob_chop(word, word->blobs[blob_number], blob_number, italic_blob, seams); |
264 | 1.07M | } |
265 | | |
266 | | SEAM *Wordrec::chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob, |
267 | 0 | WERD_RES *word_res, unsigned *blob_number) { |
268 | 0 | TWERD *word = word_res->chopped_word; |
269 | 0 | for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) { |
270 | 0 | TBLOB *blob = word->blobs[*blob_number]; |
271 | 0 | TPOINT topleft, botright; |
272 | 0 | topleft.x = blob->bounding_box().left(); |
273 | 0 | topleft.y = blob->bounding_box().top(); |
274 | 0 | botright.x = blob->bounding_box().right(); |
275 | 0 | botright.y = blob->bounding_box().bottom(); |
276 | |
|
277 | 0 | TPOINT original_topleft, original_botright; |
278 | 0 | word_res->denorm.DenormTransform(nullptr, topleft, &original_topleft); |
279 | 0 | word_res->denorm.DenormTransform(nullptr, botright, &original_botright); |
280 | |
|
281 | 0 | TBOX original_box = |
282 | 0 | TBOX(original_topleft.x, original_botright.y, original_botright.x, original_topleft.y); |
283 | |
|
284 | 0 | bool almost_equal_box = false; |
285 | 0 | int num_overlap = 0; |
286 | 0 | for (auto &&boxe : boxes) { |
287 | 0 | if (original_box.overlap_fraction(boxe) > 0.125) { |
288 | 0 | num_overlap++; |
289 | 0 | } |
290 | 0 | if (original_box.almost_equal(boxe, 3)) { |
291 | 0 | almost_equal_box = true; |
292 | 0 | } |
293 | 0 | } |
294 | |
|
295 | 0 | TPOINT location; |
296 | 0 | if (divisible_blob(blob, italic_blob, &location) || (!almost_equal_box && num_overlap > 1)) { |
297 | 0 | SEAM *seam = attempt_blob_chop(word, blob, *blob_number, italic_blob, word_res->seam_array); |
298 | 0 | if (seam != nullptr) { |
299 | 0 | return seam; |
300 | 0 | } |
301 | 0 | } |
302 | 0 | } |
303 | | |
304 | 0 | *blob_number = UINT_MAX; |
305 | 0 | return nullptr; |
306 | 0 | } |
307 | | |
308 | | /** |
309 | | * @name improve_one_blob |
310 | | * |
311 | | * Finds the best place to chop, based on the worst blob, fixpt, or next to |
312 | | * a fragment, according to the input. Returns the SEAM corresponding to the |
313 | | * chop point, if any is found, and the index in the ratings_matrix of the |
314 | | * chopped blob. Note that blob_choices is just a copy of the pointers in the |
315 | | * leading diagonal of the ratings MATRIX. |
316 | | * Although the blob is chopped, the returned SEAM is yet to be inserted into |
317 | | * word->seam_array and the resulting blobs are unclassified, so this function |
318 | | * can be used by ApplyBox as well as during recognition. |
319 | | */ |
SEAM *Wordrec::improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
                                bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
                                unsigned *blob_number) {
  // Rating ceiling shrinks each failed attempt so a different (next-worst)
  // blob is selected on the following iteration.
  float rating_ceiling = FLT_MAX;
  SEAM *seam = nullptr;
  do {
    // First preference: a split point suggested by the dictionary's danger
    // information (a single blob standing for an ngram).
    auto blob = select_blob_to_split_from_fixpt(fixpt);
    if (chop_debug) {
      tprintf("blob_number from fixpt = %d\n", blob);
    }
    bool split_point_from_dict = (blob != -1);
    if (split_point_from_dict) {
      fixpt->clear();
    } else {
      // Otherwise pick the worst-rated blob below the current ceiling.
      blob = select_blob_to_split(blob_choices, rating_ceiling, split_next_to_fragment);
    }
    if (chop_debug) {
      tprintf("blob_number = %d\n", blob);
    }
    *blob_number = blob;
    if (blob == -1) {
      // Nothing left that is worth (or possible) chopping.
      return nullptr;
    }

    // TODO(rays) it may eventually help to allow italic_blob to be true,
    seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob, word->seam_array);
    if (seam != nullptr) {
      break; // Success!
    }
    if (blob_choices[*blob_number] == nullptr) {
      return nullptr;
    }
    if (!split_point_from_dict) {
      // We chopped the worst rated blob, try something else next time.
      rating_ceiling = blob_choices[*blob_number]->rating();
    }
  } while (true);
  return seam;
}
359 | | |
360 | | /** |
361 | | * @name chop_one_blob |
362 | | * |
363 | | * Start with the current one-blob word and its classification. Find |
364 | | * the worst blobs and try to divide it up to improve the ratings. |
365 | | * Used for testing chopper. |
366 | | */ |
367 | | SEAM *Wordrec::chop_one_blob(const std::vector<TBOX> &boxes, |
368 | | const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, |
369 | 0 | unsigned *blob_number) { |
370 | 0 | if (prioritize_division) { |
371 | 0 | return chop_overlapping_blob(boxes, true, word_res, blob_number); |
372 | 0 | } else { |
373 | 0 | return improve_one_blob(blob_choices, nullptr, false, true, word_res, blob_number); |
374 | 0 | } |
375 | 0 | } |
376 | | |
377 | | /** |
378 | | * @name chop_word_main |
379 | | * |
380 | | * Classify the blobs in this word and permute the results. Find the |
381 | | * worst blob in the word and chop it up. Continue this process until |
382 | | * a good answer has been found or all the blobs have been chopped up |
383 | | * enough. The results are returned in the WERD_RES. |
384 | | */ |
void Wordrec::chop_word_main(WERD_RES *word) {
  int num_blobs = word->chopped_word->NumBlobs();
  if (word->ratings == nullptr) {
    // First time through: allocate the ratings matrix for this word.
    word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
  }
  if (word->ratings->get(0, 0) == nullptr) {
    // Run initial classification.
    for (int b = 0; b < num_blobs; ++b) {
      BLOB_CHOICE_LIST *choices = classify_piece(
          word->seam_array, b, b, "Initial:", word->chopped_word, word->blamer_bundle);
      word->ratings->put(b, b, choices);
    }
  } else {
    // Blobs have been pre-classified. Set matrix cell for all blob choices
    for (int col = 0; col < word->ratings->dimension(); ++col) {
      for (int row = col;
           row < word->ratings->dimension() && row < col + word->ratings->bandwidth(); ++row) {
        BLOB_CHOICE_LIST *choices = word->ratings->get(col, row);
        if (choices != nullptr) {
          BLOB_CHOICE_IT bc_it(choices);
          for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
            bc_it.data()->set_matrix_cell(col, row);
          }
        }
      }
    }
  }

  // Run Segmentation Search.
  BestChoiceBundle best_choice_bundle(word->ratings->dimension());
  SegSearch(word, &best_choice_bundle, word->blamer_bundle);

  if (word->best_choice == nullptr) {
    // SegSearch found no valid paths, so just use the leading diagonal.
    word->FakeWordFromRatings(TOP_CHOICE_PERM);
  }
  word->RebuildBestState();
  // If we finished without a hyphen at the end of the word, let the next word
  // be found in the dictionary.
  if (word->word->flag(W_EOL) && !getDict().has_hyphen_end(*word->best_choice)) {
    getDict().reset_hyphen_vars(true);
  }

  // Export the search lattice when a blamer callback was installed.
  if (word->blamer_bundle != nullptr && this->fill_lattice_ != nullptr) {
    CallFillLattice(*word->ratings, word->best_choices, *word->uch_set, word->blamer_bundle);
  }
  if (wordrec_debug_level > 0) {
    tprintf("Final Ratings Matrix:\n");
    word->ratings->print(getDict().getUnicharset());
  }
  word->FilterWordChoices(getDict().stopper_debug_level);
}
437 | | |
438 | | /** |
439 | | * @name improve_by_chopping |
440 | | * |
441 | | * Repeatedly chops the worst blob, classifying the new blobs fixing up all |
442 | | * the data, and incrementally runs the segmentation search until a good word |
443 | | * is found, or no more chops can be found. |
444 | | */ |
void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word,
                                  BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
                                  LMPainPoints *pain_points,
                                  std::vector<SegSearchPending> *pending) {
  unsigned blob_number;
  do { // improvement loop.
    // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
    // one to chop.
    std::vector<BLOB_CHOICE *> blob_choices;
    int num_blobs = word->ratings->dimension();
    for (int i = 0; i < num_blobs; ++i) {
      BLOB_CHOICE_LIST *choices = word->ratings->get(i, i);
      if (choices == nullptr || choices->empty()) {
        blob_choices.push_back(nullptr);
      } else {
        BLOB_CHOICE_IT bc_it(choices);
        blob_choices.push_back(bc_it.data());
      }
    }
    SEAM *seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt, false, false, word,
                                  &blob_number);
    if (seam == nullptr) {
      // No further chop is possible; stop improving.
      break;
    }
    // A chop has been made. We have to correct all the data structures to
    // take into account the extra bottom-level blob.
    // Put the seam into the seam_array and correct everything else on the
    // word: ratings matrix (including matrix location in the BLOB_CHOICES),
    // states in WERD_CHOICEs, and blob widths.
    word->InsertSeam(blob_number, seam);
    // Insert a new entry in the beam array.
    best_choice_bundle->beam.insert(best_choice_bundle->beam.begin() + blob_number, new LanguageModelState);
    // Fixpts are outdated, but will get recalculated.
    best_choice_bundle->fixpt.clear();
    // Remap existing pain points.
    pain_points->RemapForSplit(blob_number);
    // Insert a new pending at the chop point.
    pending->insert(pending->begin() + blob_number, SegSearchPending());

    // Classify the two newly created blobs using ProcessSegSearchPainPoint,
    // as that updates the pending correctly and adds new pain points.
    MATRIX_COORD pain_point(blob_number, blob_number);
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word, pain_points, blamer_bundle);
    pain_point.col = blob_number + 1;
    pain_point.row = blob_number + 1;
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word, pain_points, blamer_bundle);
    if (language_model_->language_model_ngram_on) {
      // N-gram evaluation depends on the number of blobs in a chunk, so we
      // have to re-evaluate everything in the word.
      ResetNGramSearch(word, best_choice_bundle, *pending);
      blob_number = 0;
    }
    // Run language model incrementally. (Except with the n-gram model on.)
    UpdateSegSearchNodes(rating_cert_scale, blob_number, pending, word, pain_points,
                         best_choice_bundle, blamer_bundle);
  } while (!language_model_->AcceptableChoiceFound() && word->ratings->dimension() < kMaxNumChunks);

  // If after running only the chopper best_choice is incorrect and no blame
  // has been yet set, blame the classifier if best_choice is classifier's
  // top choice and is a dictionary word (i.e. language model could not have
  // helped). Otherwise blame the tradeoff between the classifier and
  // the old language model (permuters).
  if (word->blamer_bundle != nullptr &&
      word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&
      !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
    bool valid_permuter = word->best_choice != nullptr &&
                          Dict::valid_word_permuter(word->best_choice->permuter(), false);
    word->blamer_bundle->BlameClassifierOrLangModel(word, getDict().getUnicharset(), valid_permuter,
                                                    wordrec_debug_blamer);
  }
}
516 | | |
517 | | /********************************************************************** |
518 | | * select_blob_to_split |
519 | | * |
520 | | * These are the results of the last classification. Find a likely |
521 | | * place to apply splits. If none, return -1. |
522 | | **********************************************************************/ |
int Wordrec::select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices,
                                  float rating_ceiling, bool split_next_to_fragment) {
  BLOB_CHOICE *blob_choice;
  // Track the worst-rated eligible blob, and separately the worst one that
  // sits next to a character fragment (only used when requested).
  float worst = -FLT_MAX;
  int worst_index = -1;
  float worst_near_fragment = -FLT_MAX;
  int worst_index_near_fragment = -1;
  std::vector<const CHAR_FRAGMENT *> fragments;

  if (chop_debug) {
    if (rating_ceiling < FLT_MAX) {
      tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
    } else {
      tprintf("rating_ceiling = No Limit\n");
    }
  }

  // Pre-seed fragment info for position 0; subsequent positions are filled
  // lazily inside the loop below (at x for position x+1).
  if (split_next_to_fragment && blob_choices.size() > 0) {
    fragments.resize(blob_choices.size());
    if (blob_choices[0] != nullptr) {
      fragments[0] = getDict().getUnicharset().get_fragment(blob_choices[0]->unichar_id());
    } else {
      fragments[0] = nullptr;
    }
  }

  for (unsigned x = 0; x < blob_choices.size(); ++x) {
    if (blob_choices[x] == nullptr) {
      // An unclassified blob is always the first split candidate.
      return x;
    } else {
      blob_choice = blob_choices[x];
      // Populate fragments for the following position.
      if (split_next_to_fragment && x + 1 < blob_choices.size()) {
        if (blob_choices[x + 1] != nullptr) {
          fragments[x + 1] =
              getDict().getUnicharset().get_fragment(blob_choices[x + 1]->unichar_id());
        } else {
          fragments[x + 1] = nullptr;
        }
      }
      // Only blobs below the ceiling and below the certainty threshold are
      // eligible; the ceiling excludes blobs already tried by the caller.
      if (blob_choice->rating() < rating_ceiling &&
          blob_choice->certainty() < tessedit_certainty_threshold) {
        // Update worst and worst_index.
        if (blob_choice->rating() > worst) {
          worst_index = x;
          worst = blob_choice->rating();
        }
        if (split_next_to_fragment) {
          // Update worst_near_fragment and worst_index_near_fragment.
          bool expand_following_fragment =
              (x + 1 < blob_choices.size() && fragments[x + 1] != nullptr &&
               !fragments[x + 1]->is_beginning());
          bool expand_preceding_fragment =
              (x > 0 && fragments[x - 1] != nullptr && !fragments[x - 1]->is_ending());
          if ((expand_following_fragment || expand_preceding_fragment) &&
              blob_choice->rating() > worst_near_fragment) {
            worst_index_near_fragment = x;
            worst_near_fragment = blob_choice->rating();
            if (chop_debug) {
              tprintf(
                  "worst_index_near_fragment=%d"
                  " expand_following_fragment=%d"
                  " expand_preceding_fragment=%d\n",
                  worst_index_near_fragment, expand_following_fragment, expand_preceding_fragment);
            }
          }
        }
      }
    }
  }
  // TODO(daria): maybe a threshold of badness for
  // worst_near_fragment would be useful.
  // A fragment-adjacent candidate, when found, wins over the plain worst.
  return worst_index_near_fragment != -1 ? worst_index_near_fragment : worst_index;
}
597 | | |
598 | | /********************************************************************** |
599 | | * select_blob_to_split_from_fixpt |
600 | | * |
601 | | * Given the fix point from a dictionary search, if there is a single |
602 | | * dangerous blob that maps to multiple characters, return that blob |
603 | | * index as a place we need to split. If none, return -1. |
604 | | **********************************************************************/ |
605 | 1.21M | int Wordrec::select_blob_to_split_from_fixpt(DANGERR *fixpt) { |
606 | 1.21M | if (!fixpt) { |
607 | 0 | return -1; |
608 | 0 | } |
609 | 2.60M | for (auto &i : *fixpt) { |
610 | 2.60M | if (i.begin + 1 == i.end && i.dangerous && i.correct_is_ngram) { |
611 | 0 | return i.begin; |
612 | 0 | } |
613 | 2.60M | } |
614 | 1.21M | return -1; |
615 | 1.21M | } |
616 | | |
617 | | } // namespace tesseract |