/src/tesseract/src/ccstruct/werd.cpp
Line | Count | Source |
1 | | /********************************************************************** |
2 | | * File: werd.cpp (Formerly word.c) |
3 | | * Description: Code for the WERD class. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | * (C) Copyright 1991, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | // Include automatically generated configuration file if running autoconf. |
20 | | #ifdef HAVE_CONFIG_H |
21 | | # include "config_auto.h" |
22 | | #endif |
23 | | |
24 | | #include "werd.h" |
25 | | |
26 | | #include "linlsq.h" |
27 | | |
28 | | #include "helpers.h" |
29 | | |
30 | | namespace tesseract { |
31 | | |
32 | | #define FIRST_COLOUR ScrollView::RED ///< first rainbow colour |
33 | | #define LAST_COLOUR ScrollView::AQUAMARINE ///< last rainbow colour |
34 | | #define CHILD_COLOUR ScrollView::BROWN ///< colour of children |
35 | | |
36 | | /** |
37 | | * WERD::WERD |
38 | | * |
39 | | * Constructor to build a WERD from a list of C_BLOBs. |
40 | | * blob_list The C_BLOBs (in word order) are not copied; |
41 | | * we take its elements and put them in our lists. |
42 | | * blank_count blanks in front of the word |
43 | | * text correct text, outlives this WERD |
44 | | */ |
45 | | WERD::WERD(C_BLOB_LIST *blob_list, uint8_t blank_count, const char *text) |
46 | 395k | : blanks(blank_count), flags(0), script_id_(0), correct(text ? text : "") { |
47 | 395k | C_BLOB_IT start_it = &cblobs; |
48 | 395k | C_BLOB_IT rej_cblob_it = &rej_cblobs; |
49 | 395k | C_OUTLINE_IT c_outline_it; |
50 | 395k | int16_t inverted_vote = 0; |
51 | 395k | int16_t non_inverted_vote = 0; |
52 | | |
53 | | // Move blob_list's elements into cblobs. |
54 | 395k | start_it.add_list_after(blob_list); |
55 | | |
56 | | /* |
57 | | Set white on black flag for the WERD, moving any duff blobs onto the |
58 | | rej_cblobs list. |
59 | | First, walk the cblobs checking the inverse flag for each outline of each |
60 | | cblob. If a cblob has inconsistent flag settings for its different |
61 | | outlines, move the blob to the reject list. Otherwise, increment the |
62 | | appropriate w-on-b or b-on-w vote for the word. |
63 | | |
64 | | Now set the inversion flag for the WERD by maximum vote. |
65 | | |
66 | | Walk the blobs again, moving any blob whose inversion flag does not agree |
67 | | with the concencus onto the reject list. |
68 | | */ |
69 | 395k | start_it.set_to_list(&cblobs); |
70 | 395k | if (start_it.empty()) { |
71 | 0 | return; |
72 | 0 | } |
73 | 1.75M | for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) { |
74 | 1.35M | bool reject_blob = false; |
75 | 1.35M | bool blob_inverted; |
76 | | |
77 | 1.35M | c_outline_it.set_to_list(start_it.data()->out_list()); |
78 | 1.35M | blob_inverted = c_outline_it.data()->flag(COUT_INVERSE); |
79 | 3.79M | for (c_outline_it.mark_cycle_pt(); !c_outline_it.cycled_list() && !reject_blob; |
80 | 2.44M | c_outline_it.forward()) { |
81 | 2.44M | reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted; |
82 | 2.44M | } |
83 | 1.35M | if (reject_blob) { |
84 | 11.4k | rej_cblob_it.add_after_then_move(start_it.extract()); |
85 | 1.34M | } else { |
86 | 1.34M | if (blob_inverted) { |
87 | 383k | inverted_vote++; |
88 | 960k | } else { |
89 | 960k | non_inverted_vote++; |
90 | 960k | } |
91 | 1.34M | } |
92 | 1.35M | } |
93 | | |
94 | 395k | flags.set(W_INVERSE, (inverted_vote > non_inverted_vote)); |
95 | | |
96 | 395k | start_it.set_to_list(&cblobs); |
97 | 395k | if (start_it.empty()) { |
98 | 1.04k | return; |
99 | 1.04k | } |
100 | 1.73M | for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) { |
101 | 1.34M | c_outline_it.set_to_list(start_it.data()->out_list()); |
102 | 1.34M | if (c_outline_it.data()->flag(COUT_INVERSE) != flags[W_INVERSE]) { |
103 | 21.1k | rej_cblob_it.add_after_then_move(start_it.extract()); |
104 | 21.1k | } |
105 | 1.34M | } |
106 | 394k | } |
107 | | |
108 | | /** |
109 | | * WERD::WERD |
110 | | * |
111 | | * Constructor to build a WERD from a list of C_BLOBs. |
112 | | * The C_BLOBs are not copied so the source list is emptied. |
113 | | */ |
114 | | |
115 | | WERD::WERD(C_BLOB_LIST *blob_list, ///< In word order |
116 | | WERD *clone) ///< Source of flags |
117 | 0 | : flags(clone->flags), script_id_(clone->script_id_), correct(clone->correct) { |
118 | 0 | C_BLOB_IT start_it = blob_list; // iterator |
119 | 0 | C_BLOB_IT end_it = blob_list; // another |
120 | |
|
121 | 0 | while (!end_it.at_last()) { |
122 | 0 | end_it.forward(); // move to last |
123 | 0 | } |
124 | 0 | cblobs.assign_to_sublist(&start_it, &end_it); |
125 | | // move to our list |
126 | 0 | blanks = clone->blanks; |
127 | | // fprintf(stderr,"Wrong constructor!!!!\n"); |
128 | 0 | } |
129 | | |
130 | | // Construct a WERD from a single_blob and clone the flags from this. |
131 | | // W_BOL and W_EOL flags are set according to the given values. |
132 | 0 | WERD *WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob) { |
133 | 0 | C_BLOB_LIST temp_blobs; |
134 | 0 | C_BLOB_IT temp_it(&temp_blobs); |
135 | 0 | temp_it.add_after_then_move(blob); |
136 | 0 | WERD *blob_word = new WERD(&temp_blobs, this); |
137 | 0 | blob_word->set_flag(W_BOL, bol); |
138 | 0 | blob_word->set_flag(W_EOL, eol); |
139 | 0 | return blob_word; |
140 | 0 | } |
141 | | |
142 | | /** |
143 | | * WERD::bounding_box |
144 | | * |
145 | | * Return the bounding box of the WERD. |
146 | | * This is quite a mess to compute! |
147 | | * ORIGINALLY, REJECT CBLOBS WERE EXCLUDED, however, this led to bugs when the |
148 | | * words on the row were re-sorted. The original words were built with reject |
149 | | * blobs included. The FUZZY SPACE flags were set accordingly. If ALL the |
150 | | * blobs in a word are rejected the BB for the word is nullptr, causing the sort |
151 | | * to screw up, leading to the erroneous possibility of the first word in a |
152 | | * row being marked as FUZZY space. |
153 | | */ |
154 | | |
155 | 3.37M | TBOX WERD::bounding_box() const { |
156 | 3.37M | return restricted_bounding_box(true, true); |
157 | 3.37M | } |
158 | | |
159 | | // Returns the bounding box including the desired combination of upper and |
160 | | // lower noise/diacritic elements. |
161 | 3.37M | TBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const { |
162 | 3.37M | TBOX box = true_bounding_box(); |
163 | 3.37M | int bottom = box.bottom(); |
164 | 3.37M | int top = box.top(); |
165 | | // This is a read-only iteration of the rejected blobs. |
166 | 3.37M | C_BLOB_IT it(const_cast<C_BLOB_LIST *>(&rej_cblobs)); |
167 | 4.56M | for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
168 | 1.18M | TBOX dot_box = it.data()->bounding_box(); |
169 | 1.18M | if ((upper_dots || dot_box.bottom() <= top) && (lower_dots || dot_box.top() >= bottom)) { |
170 | 1.18M | box += dot_box; |
171 | 1.18M | } |
172 | 1.18M | } |
173 | 3.37M | return box; |
174 | 3.37M | } |
175 | | |
176 | | // Returns the bounding box of only the good blobs. |
177 | 3.37M | TBOX WERD::true_bounding_box() const { |
178 | 3.37M | TBOX box; // box being built |
179 | | // This is a read-only iteration of the good blobs. |
180 | 3.37M | C_BLOB_IT it(const_cast<C_BLOB_LIST *>(&cblobs)); |
181 | 55.3M | for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
182 | 51.9M | box += it.data()->bounding_box(); |
183 | 51.9M | } |
184 | 3.37M | return box; |
185 | 3.37M | } |
186 | | |
187 | | /** |
188 | | * WERD::move |
189 | | * |
190 | | * Reposition WERD by vector |
191 | | * NOTE!! REJECT CBLOBS ARE NOT MOVED |
192 | | */ |
193 | | |
194 | 0 | void WERD::move(const ICOORD vec) { |
195 | 0 | C_BLOB_IT cblob_it(&cblobs); // cblob iterator |
196 | |
|
197 | 0 | for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) { |
198 | 0 | cblob_it.data()->move(vec); |
199 | 0 | } |
200 | 0 | } |
201 | | |
202 | | /** |
203 | | * WERD::join_on |
204 | | * |
205 | | * Join other word onto this one. Delete the old word. |
206 | | */ |
207 | | |
208 | 0 | void WERD::join_on(WERD *other) { |
209 | 0 | C_BLOB_IT blob_it(&cblobs); |
210 | 0 | C_BLOB_IT src_it(&other->cblobs); |
211 | 0 | C_BLOB_IT rej_cblob_it(&rej_cblobs); |
212 | 0 | C_BLOB_IT src_rej_it(&other->rej_cblobs); |
213 | |
|
214 | 0 | while (!src_it.empty()) { |
215 | 0 | blob_it.add_to_end(src_it.extract()); |
216 | 0 | src_it.forward(); |
217 | 0 | } |
218 | 0 | while (!src_rej_it.empty()) { |
219 | 0 | rej_cblob_it.add_to_end(src_rej_it.extract()); |
220 | 0 | src_rej_it.forward(); |
221 | 0 | } |
222 | 0 | } |
223 | | |
224 | | /** |
225 | | * WERD::copy_on |
226 | | * |
227 | | * Copy blobs from other word onto this one. |
228 | | */ |
229 | | |
230 | 39.6k | void WERD::copy_on(WERD *other) { |
231 | 39.6k | bool reversed = other->bounding_box().left() < bounding_box().left(); |
232 | 39.6k | C_BLOB_IT c_blob_it(&cblobs); |
233 | 39.6k | C_BLOB_LIST c_blobs; |
234 | | |
235 | 39.6k | c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy); |
236 | 39.6k | if (reversed) { |
237 | 0 | c_blob_it.add_list_before(&c_blobs); |
238 | 39.6k | } else { |
239 | 39.6k | c_blob_it.move_to_last(); |
240 | 39.6k | c_blob_it.add_list_after(&c_blobs); |
241 | 39.6k | } |
242 | 39.6k | if (!other->rej_cblobs.empty()) { |
243 | 3.44k | C_BLOB_IT rej_c_blob_it(&rej_cblobs); |
244 | 3.44k | C_BLOB_LIST new_rej_c_blobs; |
245 | | |
246 | 3.44k | new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy); |
247 | 3.44k | if (reversed) { |
248 | 0 | rej_c_blob_it.add_list_before(&new_rej_c_blobs); |
249 | 3.44k | } else { |
250 | 3.44k | rej_c_blob_it.move_to_last(); |
251 | 3.44k | rej_c_blob_it.add_list_after(&new_rej_c_blobs); |
252 | 3.44k | } |
253 | 3.44k | } |
254 | 39.6k | } |
255 | | |
256 | | /** |
257 | | * WERD::print |
258 | | * |
259 | | * Display members |
260 | | */ |
261 | | |
262 | 0 | void WERD::print() const { |
263 | 0 | tprintf("Blanks= %d\n", blanks); |
264 | 0 | bounding_box().print(); |
265 | 0 | tprintf("Flags = %lu = 0%lo\n", flags.to_ulong(), flags.to_ulong()); |
266 | 0 | tprintf(" W_SEGMENTED = %s\n", flags[W_SEGMENTED] ? "TRUE" : "FALSE"); |
267 | 0 | tprintf(" W_ITALIC = %s\n", flags[W_ITALIC] ? "TRUE" : "FALSE"); |
268 | 0 | tprintf(" W_BOL = %s\n", flags[W_BOL] ? "TRUE" : "FALSE"); |
269 | 0 | tprintf(" W_EOL = %s\n", flags[W_EOL] ? "TRUE" : "FALSE"); |
270 | 0 | tprintf(" W_NORMALIZED = %s\n", flags[W_NORMALIZED] ? "TRUE" : "FALSE"); |
271 | 0 | tprintf(" W_SCRIPT_HAS_XHEIGHT = %s\n", flags[W_SCRIPT_HAS_XHEIGHT] ? "TRUE" : "FALSE"); |
272 | 0 | tprintf(" W_SCRIPT_IS_LATIN = %s\n", flags[W_SCRIPT_IS_LATIN] ? "TRUE" : "FALSE"); |
273 | 0 | tprintf(" W_DONT_CHOP = %s\n", flags[W_DONT_CHOP] ? "TRUE" : "FALSE"); |
274 | 0 | tprintf(" W_REP_CHAR = %s\n", flags[W_REP_CHAR] ? "TRUE" : "FALSE"); |
275 | 0 | tprintf(" W_FUZZY_SP = %s\n", flags[W_FUZZY_SP] ? "TRUE" : "FALSE"); |
276 | 0 | tprintf(" W_FUZZY_NON = %s\n", flags[W_FUZZY_NON] ? "TRUE" : "FALSE"); |
277 | 0 | tprintf("Correct= %s\n", correct.c_str()); |
278 | 0 | tprintf("Rejected cblob count = %d\n", rej_cblobs.length()); |
279 | 0 | tprintf("Script = %d\n", script_id_); |
280 | 0 | } |
281 | | |
282 | | /** |
283 | | * WERD::plot |
284 | | * |
285 | | * Draw the WERD in the given colour. |
286 | | */ |
287 | | |
288 | | #ifndef GRAPHICS_DISABLED |
289 | | void WERD::plot(ScrollView *window, ScrollView::Color colour) { |
290 | | C_BLOB_IT it = &cblobs; |
291 | | for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
292 | | it.data()->plot(window, colour, colour); |
293 | | } |
294 | | plot_rej_blobs(window); |
295 | | } |
296 | | |
297 | | // Get the next color in the (looping) rainbow. |
298 | | ScrollView::Color WERD::NextColor(ScrollView::Color colour) { |
299 | | auto next = static_cast<ScrollView::Color>(colour + 1); |
300 | | if (next >= LAST_COLOUR || next < FIRST_COLOUR) { |
301 | | next = FIRST_COLOUR; |
302 | | } |
303 | | return next; |
304 | | } |
305 | | |
306 | | /** |
307 | | * WERD::plot |
308 | | * |
309 | | * Draw the WERD in rainbow colours in window. |
310 | | */ |
311 | | |
312 | | void WERD::plot(ScrollView *window) { |
313 | | ScrollView::Color colour = FIRST_COLOUR; |
314 | | C_BLOB_IT it = &cblobs; |
315 | | for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
316 | | it.data()->plot(window, colour, CHILD_COLOUR); |
317 | | colour = NextColor(colour); |
318 | | } |
319 | | plot_rej_blobs(window); |
320 | | } |
321 | | |
322 | | /** |
323 | | * WERD::plot_rej_blobs |
324 | | * |
325 | | * Draw the WERD rejected blobs in window - ALWAYS GREY |
326 | | */ |
327 | | |
328 | | void WERD::plot_rej_blobs(ScrollView *window) { |
329 | | C_BLOB_IT it = &rej_cblobs; |
330 | | for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
331 | | it.data()->plot(window, ScrollView::GREY, ScrollView::GREY); |
332 | | } |
333 | | } |
334 | | #endif // !GRAPHICS_DISABLED |
335 | | |
336 | | /** |
337 | | * WERD::shallow_copy() |
338 | | * |
339 | | * Make a shallow copy of a word |
340 | | */ |
341 | | |
342 | 0 | WERD *WERD::shallow_copy() { |
343 | 0 | WERD *new_word = new WERD; |
344 | |
|
345 | 0 | new_word->blanks = blanks; |
346 | 0 | new_word->flags = flags; |
347 | 0 | new_word->correct = correct; |
348 | 0 | return new_word; |
349 | 0 | } |
350 | | |
351 | | /** |
352 | | * WERD::operator= |
353 | | * |
354 | | * Assign a word, DEEP copying the blob list |
355 | | */ |
356 | | |
357 | 119k | WERD &WERD::operator=(const WERD &source) { |
358 | 119k | this->ELIST2<WERD>::LINK::operator=(source); |
359 | 119k | blanks = source.blanks; |
360 | 119k | flags = source.flags; |
361 | 119k | script_id_ = source.script_id_; |
362 | 119k | correct = source.correct; |
363 | 119k | cblobs.clear(); |
364 | 119k | cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy); |
365 | 119k | rej_cblobs.clear(); |
366 | 119k | rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy); |
367 | 119k | return *this; |
368 | 119k | } |
369 | | |
370 | | /** |
371 | | * word_comparator() |
372 | | * |
373 | | * word comparator used to sort a word list so that words are in increasing |
374 | | * order of left edge. |
375 | | */ |
376 | | |
377 | 0 | int word_comparator(const WERD *word1, const WERD *word2) { |
378 | 0 | return word1->bounding_box().left() - word2->bounding_box().left(); |
379 | 0 | } |
380 | | |
381 | | /** |
382 | | * WERD::ConstructWerdWithNewBlobs() |
383 | | * |
384 | | * This method returns a new werd constructed using the blobs in the input |
385 | | * all_blobs list, which correspond to the blobs in this werd object. The |
386 | | * blobs used to construct the new word are consumed and removed from the |
387 | | * input all_blobs list. |
388 | | * Returns nullptr if the word couldn't be constructed. |
389 | | * Returns original blobs for which no matches were found in the output list |
390 | | * orphan_blobs (appends). |
391 | | */ |
392 | | |
393 | 0 | WERD *WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs) { |
394 | 0 | C_BLOB_LIST current_blob_list; |
395 | 0 | C_BLOB_IT werd_blobs_it(¤t_blob_list); |
396 | | // Add the word's c_blobs. |
397 | 0 | werd_blobs_it.add_list_after(cblob_list()); |
398 | | |
399 | | // New blob list. These contain the blobs which will form the new word. |
400 | 0 | C_BLOB_LIST new_werd_blobs; |
401 | 0 | C_BLOB_IT new_blobs_it(&new_werd_blobs); |
402 | | |
403 | | // not_found_blobs contains the list of current word's blobs for which a |
404 | | // corresponding blob wasn't found in the input all_blobs list. |
405 | 0 | C_BLOB_LIST not_found_blobs; |
406 | 0 | C_BLOB_IT not_found_it(¬_found_blobs); |
407 | 0 | not_found_it.move_to_last(); |
408 | |
|
409 | 0 | werd_blobs_it.move_to_first(); |
410 | 0 | for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list(); werd_blobs_it.forward()) { |
411 | 0 | C_BLOB *werd_blob = werd_blobs_it.extract(); |
412 | 0 | TBOX werd_blob_box = werd_blob->bounding_box(); |
413 | 0 | bool found = false; |
414 | | // Now find the corresponding blob for this blob in the all_blobs |
415 | | // list. For now, follow the inefficient method of pairwise |
416 | | // comparisons. Ideally, one can pre-bucket the blobs by row. |
417 | 0 | C_BLOB_IT all_blobs_it(all_blobs); |
418 | 0 | for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); all_blobs_it.forward()) { |
419 | 0 | C_BLOB *a_blob = all_blobs_it.data(); |
420 | | // Compute the overlap of the two blobs. If major, a_blob should |
421 | | // be added to the new blobs list. |
422 | 0 | TBOX a_blob_box = a_blob->bounding_box(); |
423 | 0 | if (a_blob_box.null_box()) { |
424 | 0 | tprintf("Bounding box couldn't be ascertained\n"); |
425 | 0 | } |
426 | 0 | if (werd_blob_box.contains(a_blob_box) || werd_blob_box.major_overlap(a_blob_box)) { |
427 | | // Old blobs are from minimal splits, therefore are expected to be |
428 | | // bigger. The new small blobs should cover a significant portion. |
429 | | // This is it. |
430 | 0 | all_blobs_it.extract(); |
431 | 0 | new_blobs_it.add_after_then_move(a_blob); |
432 | 0 | found = true; |
433 | 0 | } |
434 | 0 | } |
435 | 0 | if (!found) { |
436 | 0 | not_found_it.add_after_then_move(werd_blob); |
437 | 0 | } else { |
438 | 0 | delete werd_blob; |
439 | 0 | } |
440 | 0 | } |
441 | | // Iterate over all not found blobs. Some of them may be due to |
442 | | // under-segmentation (which is OK, since the corresponding blob is already |
443 | | // in the list in that case. |
444 | 0 | not_found_it.move_to_first(); |
445 | 0 | for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); not_found_it.forward()) { |
446 | 0 | C_BLOB *not_found = not_found_it.data(); |
447 | 0 | TBOX not_found_box = not_found->bounding_box(); |
448 | 0 | C_BLOB_IT existing_blobs_it(new_blobs_it); |
449 | 0 | for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list(); |
450 | 0 | existing_blobs_it.forward()) { |
451 | 0 | C_BLOB *a_blob = existing_blobs_it.data(); |
452 | 0 | TBOX a_blob_box = a_blob->bounding_box(); |
453 | 0 | if ((not_found_box.major_overlap(a_blob_box) || a_blob_box.major_overlap(not_found_box)) && |
454 | 0 | not_found_box.y_overlap_fraction(a_blob_box) > 0.8) { |
455 | | // Already taken care of. |
456 | 0 | delete not_found_it.extract(); |
457 | 0 | break; |
458 | 0 | } |
459 | 0 | } |
460 | 0 | } |
461 | 0 | if (orphan_blobs) { |
462 | 0 | C_BLOB_IT orphan_blobs_it(orphan_blobs); |
463 | 0 | orphan_blobs_it.move_to_last(); |
464 | 0 | orphan_blobs_it.add_list_after(¬_found_blobs); |
465 | 0 | } |
466 | | |
467 | | // New blobs are ready. Create a new werd object with these. |
468 | 0 | WERD *new_werd = nullptr; |
469 | 0 | if (!new_werd_blobs.empty()) { |
470 | 0 | new_werd = new WERD(&new_werd_blobs, this); |
471 | 0 | } else { |
472 | | // Add the blobs back to this word so that it can be reused. |
473 | 0 | C_BLOB_IT this_list_it(cblob_list()); |
474 | 0 | this_list_it.add_list_after(¬_found_blobs); |
475 | 0 | } |
476 | 0 | return new_werd; |
477 | 0 | } |
478 | | |
479 | | // Removes noise from the word by moving small outlines to the rej_cblobs |
480 | | // list, based on the size_threshold. |
481 | 3.79k | void WERD::CleanNoise(float size_threshold) { |
482 | 3.79k | C_BLOB_IT blob_it(&cblobs); |
483 | 3.79k | C_BLOB_IT rej_it(&rej_cblobs); |
484 | 26.6k | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
485 | 22.8k | C_BLOB *blob = blob_it.data(); |
486 | 22.8k | C_OUTLINE_IT ol_it(blob->out_list()); |
487 | 135k | for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) { |
488 | 112k | C_OUTLINE *outline = ol_it.data(); |
489 | 112k | TBOX ol_box = outline->bounding_box(); |
490 | 112k | int ol_size = ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height(); |
491 | 112k | if (ol_size < size_threshold) { |
492 | | // This outline is too small. Move it to a separate blob in the |
493 | | // reject blobs list. |
494 | 96.5k | auto *rej_blob = new C_BLOB(ol_it.extract()); |
495 | 96.5k | rej_it.add_after_then_move(rej_blob); |
496 | 96.5k | } |
497 | 112k | } |
498 | 22.8k | if (blob->out_list()->empty()) { |
499 | 13.3k | delete blob_it.extract(); |
500 | 13.3k | } |
501 | 22.8k | } |
502 | 3.79k | } |
503 | | |
504 | | // Extracts all the noise outlines and stuffs the pointers into the given |
505 | | // vector of outlines. Afterwards, the outlines vector owns the pointers. |
506 | 0 | void WERD::GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines) { |
507 | 0 | C_BLOB_IT rej_it(&rej_cblobs); |
508 | 0 | for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) { |
509 | 0 | C_BLOB *blob = rej_it.extract(); |
510 | 0 | C_OUTLINE_IT ol_it(blob->out_list()); |
511 | 0 | outlines->push_back(ol_it.extract()); |
512 | 0 | delete blob; |
513 | 0 | } |
514 | 0 | } |
515 | | |
516 | | // Adds the selected outlines to the indicated real blobs, and puts the rest |
517 | | // back in rej_cblobs where they came from. Where the target_blobs entry is |
518 | | // nullptr, a run of wanted outlines is put into a single new blob. |
519 | | // Ownership of the outlines is transferred back to the word. (Hence |
520 | | // vector and not PointerVector.) |
521 | | // Returns true if any new blob was added to the start of the word, which |
522 | | // suggests that it might need joining to the word before it, and likewise |
523 | | // sets make_next_word_fuzzy true if any new blob was added to the end. |
524 | | bool WERD::AddSelectedOutlines(const std::vector<bool> &wanted, |
525 | | const std::vector<C_BLOB *> &target_blobs, |
526 | | const std::vector<C_OUTLINE *> &outlines, |
527 | 0 | bool *make_next_word_fuzzy) { |
528 | 0 | bool outline_added_to_start = false; |
529 | 0 | if (make_next_word_fuzzy != nullptr) { |
530 | 0 | *make_next_word_fuzzy = false; |
531 | 0 | } |
532 | 0 | C_BLOB_IT rej_it(&rej_cblobs); |
533 | 0 | for (unsigned i = 0; i < outlines.size(); ++i) { |
534 | 0 | C_OUTLINE *outline = outlines[i]; |
535 | 0 | if (outline == nullptr) { |
536 | 0 | continue; // Already used it. |
537 | 0 | } |
538 | 0 | if (wanted[i]) { |
539 | 0 | C_BLOB *target_blob = target_blobs[i]; |
540 | 0 | TBOX noise_box = outline->bounding_box(); |
541 | 0 | if (target_blob == nullptr) { |
542 | 0 | target_blob = new C_BLOB(outline); |
543 | | // Need to find the insertion point. |
544 | 0 | C_BLOB_IT blob_it(&cblobs); |
545 | 0 | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
546 | 0 | C_BLOB *blob = blob_it.data(); |
547 | 0 | TBOX blob_box = blob->bounding_box(); |
548 | 0 | if (blob_box.left() > noise_box.left()) { |
549 | 0 | if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) { |
550 | | // We might want to join this word to its predecessor. |
551 | 0 | outline_added_to_start = true; |
552 | 0 | } |
553 | 0 | blob_it.add_before_stay_put(target_blob); |
554 | 0 | break; |
555 | 0 | } |
556 | 0 | } |
557 | 0 | if (blob_it.cycled_list()) { |
558 | 0 | blob_it.add_to_end(target_blob); |
559 | 0 | if (make_next_word_fuzzy != nullptr) { |
560 | 0 | *make_next_word_fuzzy = true; |
561 | 0 | } |
562 | 0 | } |
563 | | // Add all consecutive wanted, but null-blob outlines to same blob. |
564 | 0 | C_OUTLINE_IT ol_it(target_blob->out_list()); |
565 | 0 | while (i + 1 < outlines.size() && wanted[i + 1] && target_blobs[i + 1] == nullptr) { |
566 | 0 | ++i; |
567 | 0 | ol_it.add_to_end(outlines[i]); |
568 | 0 | } |
569 | 0 | } else { |
570 | | // Insert outline into this blob. |
571 | 0 | C_OUTLINE_IT ol_it(target_blob->out_list()); |
572 | 0 | ol_it.add_to_end(outline); |
573 | 0 | } |
574 | 0 | } else { |
575 | | // Put back on noise list. |
576 | 0 | rej_it.add_to_end(new C_BLOB(outline)); |
577 | 0 | } |
578 | 0 | } |
579 | 0 | return outline_added_to_start; |
580 | 0 | } |
581 | | |
582 | | } // namespace tesseract |