/src/tesseract/src/ccstruct/ratngs.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: ratngs.cpp (Formerly ratings.c) |
3 | | * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | * (C) Copyright 1992, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | #ifdef HAVE_CONFIG_H |
20 | | # include "config_auto.h" |
21 | | #endif |
22 | | |
23 | | #include "ratngs.h" |
24 | | |
25 | | #include "blobs.h" |
26 | | #include "matrix.h" |
27 | | #include "normalis.h" // kBlnBaselineOffset. |
28 | | #include "unicharset.h" |
29 | | |
30 | | #include <algorithm> |
31 | | #include <cmath> |
32 | | #include <string> |
33 | | #include <vector> |
34 | | |
35 | | namespace tesseract { |
36 | | |
37 | | const float WERD_CHOICE::kBadRating = 100000.0; |
// Minimum offset, in baseline-normalized coordinates, for a character to be
// treated as a subscript.
constexpr int kMinSubscriptOffset = 20;
// Minimum offset, in baseline-normalized coordinates, for a character to be
// treated as a superscript.
constexpr int kMinSuperscriptOffset = 20;
// Highest allowed y of the bottom of a drop-cap blob.
constexpr int kMaxDropCapBottom = -128;
// Largest fraction of the x-height that may serve as the denominator when
// measuring x-height overlap.
constexpr double kMaxOverlapDenominator = 0.125;
// Smallest fraction of the x-height range that must agree for two x-heights
// to be considered matching.
constexpr double kMinXHeightMatch = 0.5;
// Largest baseline difference, as a fraction of the x-height, for two
// baselines to be considered matching.
constexpr double kMaxBaselineDrift = 0.0625;
52 | | |
// Printable names of the permuter types, indexed by permuter code (0..12).
static const char *const kPermuterTypeNames[] = {
    "None",                      // 0
    "Punctuation",               // 1
    "Top Choice",                // 2
    "Top Lower Case",            // 3
    "Top Upper Case",            // 4
    "Ngram",                     // 5
    "Number",                    // 6
    "User Pattern",              // 7
    "System Dictionary",         // 8
    "Document Dictionary",       // 9
    "User Dictionary",           // 10
    "Frequent Words Dictionary", // 11
    "Compound"                   // 12
};
82 | | |
83 | | /** |
84 | | * BLOB_CHOICE::BLOB_CHOICE |
85 | | * |
86 | | * Constructor to build a BLOB_CHOICE from a char, rating and certainty. |
87 | | */ |
88 | | BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id |
89 | | float src_rating, // rating |
90 | | float src_cert, // certainty |
91 | | int src_script_id, // script |
92 | | float min_xheight, // min xheight allowed |
93 | | float max_xheight, // max xheight by this char |
94 | | float yshift, // yshift out of position |
95 | 13.2M | BlobChoiceClassifier c) { // adapted match or other |
96 | 13.2M | unichar_id_ = src_unichar_id; |
97 | 13.2M | rating_ = src_rating; |
98 | 13.2M | certainty_ = src_cert; |
99 | 13.2M | fontinfo_id_ = -1; |
100 | 13.2M | fontinfo_id2_ = -1; |
101 | 13.2M | script_id_ = src_script_id; |
102 | 13.2M | min_xheight_ = min_xheight; |
103 | 13.2M | max_xheight_ = max_xheight; |
104 | 13.2M | yshift_ = yshift; |
105 | 13.2M | classifier_ = c; |
106 | 13.2M | } |
107 | | |
108 | | /** |
109 | | * BLOB_CHOICE::BLOB_CHOICE |
110 | | * |
111 | | * Constructor to build a BLOB_CHOICE from another BLOB_CHOICE. |
112 | | */ |
113 | 36.0k | BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) : ELIST<BLOB_CHOICE>::LINK(other) { |
114 | 36.0k | unichar_id_ = other.unichar_id(); |
115 | 36.0k | rating_ = other.rating(); |
116 | 36.0k | certainty_ = other.certainty(); |
117 | 36.0k | fontinfo_id_ = other.fontinfo_id(); |
118 | 36.0k | fontinfo_id2_ = other.fontinfo_id2(); |
119 | 36.0k | script_id_ = other.script_id(); |
120 | 36.0k | matrix_cell_ = other.matrix_cell_; |
121 | 36.0k | min_xheight_ = other.min_xheight_; |
122 | 36.0k | max_xheight_ = other.max_xheight_; |
123 | 36.0k | yshift_ = other.yshift(); |
124 | 36.0k | classifier_ = other.classifier_; |
125 | 36.0k | #ifndef DISABLED_LEGACY_ENGINE |
126 | 36.0k | fonts_ = other.fonts_; |
127 | 36.0k | #endif // ndef DISABLED_LEGACY_ENGINE |
128 | 36.0k | } |
129 | | |
130 | | // Copy assignment operator. |
131 | 0 | BLOB_CHOICE &BLOB_CHOICE::operator=(const BLOB_CHOICE &other) { |
132 | 0 | ELIST<BLOB_CHOICE>::LINK::operator=(other); |
133 | 0 | unichar_id_ = other.unichar_id(); |
134 | 0 | rating_ = other.rating(); |
135 | 0 | certainty_ = other.certainty(); |
136 | 0 | fontinfo_id_ = other.fontinfo_id(); |
137 | 0 | fontinfo_id2_ = other.fontinfo_id2(); |
138 | 0 | script_id_ = other.script_id(); |
139 | 0 | matrix_cell_ = other.matrix_cell_; |
140 | 0 | min_xheight_ = other.min_xheight_; |
141 | 0 | max_xheight_ = other.max_xheight_; |
142 | 0 | yshift_ = other.yshift(); |
143 | 0 | classifier_ = other.classifier_; |
144 | 0 | #ifndef DISABLED_LEGACY_ENGINE |
145 | 0 | fonts_ = other.fonts_; |
146 | 0 | #endif // ndef DISABLED_LEGACY_ENGINE |
147 | 0 | return *this; |
148 | 0 | } |
149 | | |
150 | | // Returns true if *this and other agree on the baseline and x-height |
151 | | // to within some tolerance based on a given estimate of the x-height. |
152 | 664k | bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const { |
153 | 664k | double baseline_diff = std::fabs(yshift() - other.yshift()); |
154 | 664k | if (baseline_diff > kMaxBaselineDrift * x_height) { |
155 | 417k | if (debug) { |
156 | 0 | tprintf("Baseline diff %g for %d v %d\n", baseline_diff, unichar_id_, other.unichar_id_); |
157 | 0 | } |
158 | 417k | return false; |
159 | 417k | } |
160 | 247k | double this_range = max_xheight() - min_xheight(); |
161 | 247k | double other_range = other.max_xheight() - other.min_xheight(); |
162 | 247k | double denominator = |
163 | 247k | ClipToRange(std::min(this_range, other_range), 1.0, kMaxOverlapDenominator * x_height); |
164 | 247k | double overlap = |
165 | 247k | std::min(max_xheight(), other.max_xheight()) - std::max(min_xheight(), other.min_xheight()); |
166 | 247k | overlap /= denominator; |
167 | 247k | if (debug) { |
168 | 0 | tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n", unichar_id_, |
169 | 0 | other.unichar_id_, baseline_diff, this_range, other_range, denominator, overlap); |
170 | 0 | } |
171 | | |
172 | 247k | return overlap >= kMinXHeightMatch; |
173 | 664k | } |
174 | | |
175 | | // Helper to find the BLOB_CHOICE in the bc_list that matches the given |
176 | | // unichar_id, or nullptr if there is no match. |
177 | 644k | BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list) { |
178 | | // Find the corresponding best BLOB_CHOICE. |
179 | 644k | BLOB_CHOICE_IT choice_it(bc_list); |
180 | 894k | for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { |
181 | 857k | BLOB_CHOICE *choice = choice_it.data(); |
182 | 857k | if (choice->unichar_id() == char_id) { |
183 | 608k | return choice; |
184 | 608k | } |
185 | 857k | } |
186 | 36.4k | return nullptr; |
187 | 644k | } |
188 | | |
189 | 0 | const char *WERD_CHOICE::permuter_name(uint8_t permuter) { |
190 | 0 | return kPermuterTypeNames[permuter]; |
191 | 0 | } |
192 | | |
193 | 0 | const char *ScriptPosToString(enum ScriptPos script_pos) { |
194 | 0 | switch (script_pos) { |
195 | 0 | case SP_NORMAL: |
196 | 0 | return "NORM"; |
197 | 0 | case SP_SUBSCRIPT: |
198 | 0 | return "SUB"; |
199 | 0 | case SP_SUPERSCRIPT: |
200 | 0 | return "SUPER"; |
201 | 0 | case SP_DROPCAP: |
202 | 0 | return "DROPC"; |
203 | 0 | } |
204 | 0 | return "SP_UNKNOWN"; |
205 | 0 | } |
206 | | |
207 | | /** |
208 | | * WERD_CHOICE::WERD_CHOICE |
209 | | * |
210 | | * Constructor to build a WERD_CHOICE from the given string. |
211 | | * The function assumes that src_string is not nullptr. |
212 | | */ |
213 | | WERD_CHOICE::WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset) |
214 | 0 | : unicharset_(&unicharset) { |
215 | 0 | std::vector<UNICHAR_ID> encoding; |
216 | 0 | std::vector<char> lengths; |
217 | 0 | std::string cleaned = unicharset.CleanupString(src_string); |
218 | 0 | if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths, nullptr)) { |
219 | 0 | lengths.push_back('\0'); |
220 | 0 | std::string src_lengths = &lengths[0]; |
221 | 0 | this->init(cleaned.c_str(), src_lengths.c_str(), 0.0, 0.0, NO_PERM); |
222 | 0 | } else { // There must have been an invalid unichar in the string. |
223 | 0 | this->init(8); |
224 | 0 | this->make_bad(); |
225 | 0 | } |
226 | 0 | } |
227 | | |
228 | | /** |
229 | | * WERD_CHOICE::init |
230 | | * |
231 | | * Helper function to build a WERD_CHOICE from the given string, |
232 | | * fragment lengths, rating, certainty and permuter. |
233 | | * |
234 | | * The function assumes that src_string is not nullptr. |
235 | | * src_lengths argument could be nullptr, in which case the unichars |
236 | | * in src_string are assumed to all be of length 1. |
237 | | */ |
238 | | void WERD_CHOICE::init(const char *src_string, const char *src_lengths, float src_rating, |
239 | 0 | float src_certainty, uint8_t src_permuter) { |
240 | 0 | int src_string_len = strlen(src_string); |
241 | 0 | if (src_string_len == 0) { |
242 | 0 | this->init(8); |
243 | 0 | } else { |
244 | 0 | this->init(src_lengths ? strlen(src_lengths) : src_string_len); |
245 | 0 | length_ = reserved_; |
246 | 0 | int offset = 0; |
247 | 0 | for (unsigned i = 0; i < length_; ++i) { |
248 | 0 | int unichar_length = src_lengths ? src_lengths[i] : 1; |
249 | 0 | unichar_ids_[i] = unicharset_->unichar_to_id(src_string + offset, unichar_length); |
250 | 0 | state_[i] = 1; |
251 | 0 | certainties_[i] = src_certainty; |
252 | 0 | offset += unichar_length; |
253 | 0 | } |
254 | 0 | } |
255 | 0 | adjust_factor_ = 1.0f; |
256 | 0 | rating_ = src_rating; |
257 | 0 | certainty_ = src_certainty; |
258 | 0 | permuter_ = src_permuter; |
259 | 0 | dangerous_ambig_found_ = false; |
260 | 0 | } |
261 | | |
262 | | /** |
263 | | * WERD_CHOICE::~WERD_CHOICE |
264 | | */ |
265 | 2.18M | WERD_CHOICE::~WERD_CHOICE() = default; |
266 | | |
267 | 0 | const char *WERD_CHOICE::permuter_name() const { |
268 | 0 | return kPermuterTypeNames[permuter_]; |
269 | 0 | } |
270 | | |
271 | | // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word, |
272 | | // taken from the appropriate cell in the ratings MATRIX. |
273 | | // Borrowed pointer, so do not delete. |
274 | 331k | BLOB_CHOICE_LIST *WERD_CHOICE::blob_choices(unsigned index, MATRIX *ratings) const { |
275 | 331k | MATRIX_COORD coord = MatrixCoord(index); |
276 | 331k | BLOB_CHOICE_LIST *result = ratings->get(coord.col, coord.row); |
277 | 331k | if (result == nullptr) { |
278 | 320 | result = new BLOB_CHOICE_LIST; |
279 | 320 | ratings->put(coord.col, coord.row, result); |
280 | 320 | } |
281 | 331k | return result; |
282 | 331k | } |
283 | | |
284 | | // Returns the MATRIX_COORD corresponding to the location in the ratings |
285 | | // MATRIX for the given index into the word. |
286 | 332k | MATRIX_COORD WERD_CHOICE::MatrixCoord(unsigned index) const { |
287 | 332k | int col = 0; |
288 | 2.47M | for (unsigned i = 0; i < index; ++i) { |
289 | 2.14M | col += state_[i]; |
290 | 2.14M | } |
291 | 332k | int row = col + state_[index] - 1; |
292 | 332k | return MATRIX_COORD(col, row); |
293 | 332k | } |
294 | | |
295 | | // Sets the entries for the given index from the BLOB_CHOICE, assuming |
296 | | // unit fragment lengths, but setting the state for this index to blob_count. |
297 | 2.98M | void WERD_CHOICE::set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice) { |
298 | 2.98M | unichar_ids_[index] = blob_choice->unichar_id(); |
299 | 2.98M | script_pos_[index] = tesseract::SP_NORMAL; |
300 | 2.98M | state_[index] = blob_count; |
301 | 2.98M | certainties_[index] = blob_choice->certainty(); |
302 | 2.98M | } |
303 | | |
304 | | /** |
305 | | * contains_unichar_id |
306 | | * |
307 | | * Returns true if unichar_ids_ contain the given unichar_id, false otherwise. |
308 | | */ |
309 | 0 | bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const { |
310 | 0 | for (unsigned i = 0; i < length_; ++i) { |
311 | 0 | if (unichar_ids_[i] == unichar_id) { |
312 | 0 | return true; |
313 | 0 | } |
314 | 0 | } |
315 | 0 | return false; |
316 | 0 | } |
317 | | |
318 | | /** |
319 | | * remove_unichar_ids |
320 | | * |
321 | | * Removes num unichar ids starting from index start from unichar_ids_ |
322 | | * and updates length_ and fragment_lengths_ to reflect this change. |
323 | | * Note: this function does not modify rating_ and certainty_. |
324 | | */ |
325 | 105k | void WERD_CHOICE::remove_unichar_ids(unsigned start, int num) { |
326 | 105k | ASSERT_HOST(start + num <= length_); |
327 | | // Accumulate the states to account for the merged blobs. |
328 | 211k | for (int i = 0; i < num; ++i) { |
329 | 105k | if (start > 0) { |
330 | 105k | state_[start - 1] += state_[start + i]; |
331 | 105k | } else if (start + num < length_) { |
332 | 0 | state_[start + num] += state_[start + i]; |
333 | 0 | } |
334 | 105k | } |
335 | 692k | for (unsigned i = start; i + num < length_; ++i) { |
336 | 587k | unichar_ids_[i] = unichar_ids_[i + num]; |
337 | 587k | script_pos_[i] = script_pos_[i + num]; |
338 | 587k | state_[i] = state_[i + num]; |
339 | 587k | certainties_[i] = certainties_[i + num]; |
340 | 587k | } |
341 | 105k | length_ -= num; |
342 | 105k | } |
343 | | |
344 | | /** |
345 | | * reverse_and_mirror_unichar_ids |
346 | | * |
347 | | * Reverses and mirrors unichars in unichar_ids. |
348 | | */ |
349 | 0 | void WERD_CHOICE::reverse_and_mirror_unichar_ids() { |
350 | 0 | for (unsigned i = 0; i < length_ / 2; ++i) { |
351 | 0 | UNICHAR_ID tmp_id = unichar_ids_[i]; |
352 | 0 | unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_ - 1 - i]); |
353 | 0 | unichar_ids_[length_ - 1 - i] = unicharset_->get_mirror(tmp_id); |
354 | 0 | } |
355 | 0 | if (length_ % 2 != 0) { |
356 | 0 | unichar_ids_[length_ / 2] = unicharset_->get_mirror(unichar_ids_[length_ / 2]); |
357 | 0 | } |
358 | 0 | } |
359 | | |
360 | | /** |
361 | | * punct_stripped |
362 | | * |
363 | | * Returns the half-open interval of unichar_id indices [start, end) which |
364 | | * enclose the core portion of this word -- the part after stripping |
365 | | * punctuation from the left and right. |
366 | | */ |
367 | 0 | void WERD_CHOICE::punct_stripped(unsigned *start, unsigned *end) const { |
368 | 0 | *start = 0; |
369 | 0 | *end = length(); |
370 | 0 | while (*start < length() && unicharset()->get_ispunctuation(unichar_id(*start))) { |
371 | 0 | (*start)++; |
372 | 0 | } |
373 | 0 | while (*end > *start && unicharset()->get_ispunctuation(unichar_id(*end - 1))) { |
374 | 0 | (*end)--; |
375 | 0 | } |
376 | 0 | } |
377 | | |
378 | 0 | void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const { |
379 | 0 | int end = length(); |
380 | 0 | while (end > 0 && unicharset_->get_isdigit(unichar_ids_[end - 1]) && |
381 | 0 | BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) { |
382 | 0 | end--; |
383 | 0 | } |
384 | 0 | int start = 0; |
385 | 0 | while (start < end && unicharset_->get_isdigit(unichar_ids_[start]) && |
386 | 0 | BlobPosition(start) == tesseract::SP_SUPERSCRIPT) { |
387 | 0 | start++; |
388 | 0 | } |
389 | 0 | *pstart = start; |
390 | 0 | *pend = end; |
391 | 0 | } |
392 | | |
393 | 0 | WERD_CHOICE WERD_CHOICE::shallow_copy(unsigned start, unsigned end) const { |
394 | 0 | ASSERT_HOST(start <= length_); |
395 | 0 | ASSERT_HOST(end <= length_); |
396 | 0 | if (end < start) { |
397 | 0 | end = start; |
398 | 0 | } |
399 | 0 | WERD_CHOICE retval(unicharset_, end - start); |
400 | 0 | for (auto i = start; i < end; i++) { |
401 | 0 | retval.append_unichar_id_space_allocated(unichar_ids_[i], state_[i], 0.0f, certainties_[i]); |
402 | 0 | } |
403 | 0 | return retval; |
404 | 0 | } |
405 | | |
406 | | /** |
407 | | * has_rtl_unichar_id |
408 | | * |
409 | | * Returns true if unichar_ids contain at least one "strongly" RTL unichar. |
410 | | */ |
411 | 0 | bool WERD_CHOICE::has_rtl_unichar_id() const { |
412 | 0 | for (unsigned i = 0; i < length_; ++i) { |
413 | 0 | UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]); |
414 | 0 | if (dir == UNICHARSET::U_RIGHT_TO_LEFT || dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) { |
415 | 0 | return true; |
416 | 0 | } |
417 | 0 | } |
418 | 0 | return false; |
419 | 0 | } |
420 | | |
421 | | /** |
422 | | * string_and_lengths |
423 | | * |
424 | | * Populates the given word_str with unichars from unichar_ids and |
425 | | * and word_lengths_str with the corresponding unichar lengths. |
426 | | */ |
427 | 3.26M | void WERD_CHOICE::string_and_lengths(std::string *word_str, std::string *word_lengths_str) const { |
428 | 3.26M | *word_str = ""; |
429 | 3.26M | if (word_lengths_str != nullptr) { |
430 | 3.26M | *word_lengths_str = ""; |
431 | 3.26M | } |
432 | 34.0M | for (unsigned i = 0; i < length_; ++i) { |
433 | 30.8M | const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]); |
434 | 30.8M | *word_str += ch; |
435 | 30.8M | if (word_lengths_str != nullptr) { |
436 | 30.8M | *word_lengths_str += (char)strlen(ch); |
437 | 30.8M | } |
438 | 30.8M | } |
439 | 3.26M | } |
440 | | |
441 | | /** |
442 | | * append_unichar_id |
443 | | * |
444 | | * Make sure there is enough space in the word for the new unichar id |
445 | | * and call append_unichar_id_space_allocated(). |
446 | | */ |
447 | | void WERD_CHOICE::append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, |
448 | 1.45M | float certainty) { |
449 | 1.45M | if (length_ == reserved_) { |
450 | 41.4k | this->double_the_size(); |
451 | 41.4k | } |
452 | 1.45M | this->append_unichar_id_space_allocated(unichar_id, blob_count, rating, certainty); |
453 | 1.45M | } |
454 | | |
455 | | /** |
456 | | * WERD_CHOICE::operator+= |
457 | | * |
458 | | * Cat a second word rating on the end of this current one. |
459 | | * The ratings are added and the confidence is the min. |
460 | | * If the permuters are NOT the same the permuter is set to COMPOUND_PERM |
461 | | */ |
462 | 280k | WERD_CHOICE &WERD_CHOICE::operator+=(const WERD_CHOICE &second) { |
463 | 280k | ASSERT_HOST(unicharset_ == second.unicharset_); |
464 | 1.59M | while (reserved_ < length_ + second.length()) { |
465 | 1.31M | this->double_the_size(); |
466 | 1.31M | } |
467 | 280k | const std::vector<UNICHAR_ID> &other_unichar_ids = second.unichar_ids(); |
468 | 12.3M | for (unsigned i = 0; i < second.length(); ++i) { |
469 | 12.0M | unichar_ids_[length_ + i] = other_unichar_ids[i]; |
470 | 12.0M | state_[length_ + i] = second.state_[i]; |
471 | 12.0M | certainties_[length_ + i] = second.certainties_[i]; |
472 | 12.0M | script_pos_[length_ + i] = second.BlobPosition(i); |
473 | 12.0M | } |
474 | 280k | length_ += second.length(); |
475 | 280k | if (second.adjust_factor_ > adjust_factor_) { |
476 | 153k | adjust_factor_ = second.adjust_factor_; |
477 | 153k | } |
478 | 280k | rating_ += second.rating(); // add ratings |
479 | 280k | if (second.certainty() < certainty_) { // take min |
480 | 224k | certainty_ = second.certainty(); |
481 | 224k | } |
482 | 280k | if (second.dangerous_ambig_found_) { |
483 | 0 | dangerous_ambig_found_ = true; |
484 | 0 | } |
485 | 280k | if (permuter_ == NO_PERM) { |
486 | 0 | permuter_ = second.permuter(); |
487 | 280k | } else if (second.permuter() != NO_PERM && second.permuter() != permuter_) { |
488 | 0 | permuter_ = COMPOUND_PERM; |
489 | 0 | } |
490 | 280k | return *this; |
491 | 280k | } |
492 | | |
493 | | /** |
494 | | * WERD_CHOICE::operator= |
495 | | * |
496 | | * Allocate enough memory to hold a copy of source and copy over |
497 | | * all the information from source to this WERD_CHOICE. |
498 | | */ |
499 | 598k | WERD_CHOICE &WERD_CHOICE::operator=(const WERD_CHOICE &source) { |
500 | 598k | while (reserved_ < source.length()) { |
501 | 0 | this->double_the_size(); |
502 | 0 | } |
503 | | |
504 | 598k | unicharset_ = source.unicharset_; |
505 | 598k | const std::vector<UNICHAR_ID> &other_unichar_ids = source.unichar_ids(); |
506 | 3.72M | for (unsigned i = 0; i < source.length(); ++i) { |
507 | 3.12M | unichar_ids_[i] = other_unichar_ids[i]; |
508 | 3.12M | state_[i] = source.state_[i]; |
509 | 3.12M | certainties_[i] = source.certainties_[i]; |
510 | 3.12M | script_pos_[i] = source.BlobPosition(i); |
511 | 3.12M | } |
512 | 598k | length_ = source.length(); |
513 | 598k | adjust_factor_ = source.adjust_factor_; |
514 | 598k | rating_ = source.rating(); |
515 | 598k | certainty_ = source.certainty(); |
516 | 598k | min_x_height_ = source.min_x_height(); |
517 | 598k | max_x_height_ = source.max_x_height(); |
518 | 598k | permuter_ = source.permuter(); |
519 | 598k | dangerous_ambig_found_ = source.dangerous_ambig_found_; |
520 | 598k | return *this; |
521 | 598k | } |
522 | | |
523 | | // Sets up the script_pos_ member using the blobs_list to get the bln |
524 | | // bounding boxes, *this to get the unichars, and this->unicharset |
525 | | // to get the target positions. If small_caps is true, sub/super are not |
526 | | // considered, but dropcaps are. |
527 | | // NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.) |
528 | 399k | void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD *word, int debug) { |
529 | | // Initialize to normal. |
530 | 3.17M | for (unsigned i = 0; i < length_; ++i) { |
531 | 2.77M | script_pos_[i] = tesseract::SP_NORMAL; |
532 | 2.77M | } |
533 | 399k | if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) { |
534 | 0 | return; |
535 | 0 | } |
536 | | |
537 | 399k | unsigned position_counts[4] = {0, 0, 0, 0}; |
538 | | |
539 | 399k | int chunk_index = 0; |
540 | 3.17M | for (unsigned blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) { |
541 | 2.77M | TBLOB *tblob = word->blobs[chunk_index]; |
542 | 2.77M | int uni_id = unichar_id(blob_index); |
543 | 2.77M | TBOX blob_box = tblob->bounding_box(); |
544 | 2.77M | if (!state_.empty()) { |
545 | 3.91M | for (int i = 1; i < state_[blob_index]; ++i) { |
546 | 1.14M | ++chunk_index; |
547 | 1.14M | tblob = word->blobs[chunk_index]; |
548 | 1.14M | blob_box += tblob->bounding_box(); |
549 | 1.14M | } |
550 | 2.77M | } |
551 | 2.77M | script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box, uni_id); |
552 | 2.77M | if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) { |
553 | 0 | script_pos_[blob_index] = tesseract::SP_NORMAL; |
554 | 0 | } |
555 | 2.77M | position_counts[script_pos_[blob_index]]++; |
556 | 2.77M | } |
557 | | // If almost everything looks like a superscript or subscript, |
558 | | // we most likely just got the baseline wrong. |
559 | 399k | if (4 * position_counts[tesseract::SP_SUBSCRIPT] > 3 * length_ || |
560 | 399k | 4 * position_counts[tesseract::SP_SUPERSCRIPT] > 3 * length_) { |
561 | 37.8k | if (debug >= 2) { |
562 | 0 | tprintf( |
563 | 0 | "Most characters of %s are subscript or superscript.\n" |
564 | 0 | "That seems wrong, so I'll assume we got the baseline wrong\n", |
565 | 0 | unichar_string().c_str()); |
566 | 0 | } |
567 | 226k | for (unsigned i = 0; i < length_; i++) { |
568 | 188k | ScriptPos sp = script_pos_[i]; |
569 | 188k | if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) { |
570 | 175k | ASSERT_HOST(position_counts[sp] > 0); |
571 | 175k | position_counts[sp]--; |
572 | 175k | position_counts[tesseract::SP_NORMAL]++; |
573 | 175k | script_pos_[i] = tesseract::SP_NORMAL; |
574 | 175k | } |
575 | 188k | } |
576 | 37.8k | } |
577 | | |
578 | 399k | if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) || debug >= 2) { |
579 | 0 | tprintf("SetScriptPosition on %s\n", unichar_string().c_str()); |
580 | 0 | int chunk_index = 0; |
581 | 0 | for (unsigned blob_index = 0; blob_index < length_; ++blob_index) { |
582 | 0 | if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) { |
583 | 0 | TBLOB *tblob = word->blobs[chunk_index]; |
584 | 0 | ScriptPositionOf(true, *unicharset_, tblob->bounding_box(), unichar_id(blob_index)); |
585 | 0 | } |
586 | 0 | chunk_index += state_.empty() ? 1 : state_[blob_index]; |
587 | 0 | } |
588 | 0 | } |
589 | 399k | } |
590 | | |
591 | | // Sets all the script_pos_ positions to the given position. |
592 | 5.33k | void WERD_CHOICE::SetAllScriptPositions(tesseract::ScriptPos position) { |
593 | 24.2k | for (unsigned i = 0; i < length_; ++i) { |
594 | 18.8k | script_pos_[i] = position; |
595 | 18.8k | } |
596 | 5.33k | } |
597 | | |
598 | | /* static */ |
599 | | ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, |
600 | 2.77M | const TBOX &blob_box, UNICHAR_ID unichar_id) { |
601 | 2.77M | ScriptPos retval = tesseract::SP_NORMAL; |
602 | 2.77M | int top = blob_box.top(); |
603 | 2.77M | int bottom = blob_box.bottom(); |
604 | 2.77M | int min_bottom, max_bottom, min_top, max_top; |
605 | 2.77M | unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top); |
606 | | |
607 | 2.77M | int sub_thresh_top = min_top - kMinSubscriptOffset; |
608 | 2.77M | int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset; |
609 | 2.77M | int sup_thresh_bot = max_bottom + kMinSuperscriptOffset; |
610 | 2.77M | if (bottom <= kMaxDropCapBottom) { |
611 | 406k | retval = tesseract::SP_DROPCAP; |
612 | 2.37M | } else if (top < sub_thresh_top && bottom < sub_thresh_bot) { |
613 | 391k | retval = tesseract::SP_SUBSCRIPT; |
614 | 1.98M | } else if (bottom > sup_thresh_bot) { |
615 | 638k | retval = tesseract::SP_SUPERSCRIPT; |
616 | 638k | } |
617 | | |
618 | 2.77M | if (print_debug) { |
619 | 0 | const char *pos = ScriptPosToString(retval); |
620 | 0 | tprintf( |
621 | 0 | "%s Character %s[bot:%d top: %d] " |
622 | 0 | "bot_range[%d,%d] top_range[%d, %d] " |
623 | 0 | "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n", |
624 | 0 | pos, unicharset.id_to_unichar(unichar_id), bottom, top, min_bottom, max_bottom, min_top, |
625 | 0 | max_top, sub_thresh_bot, sub_thresh_top, sup_thresh_bot); |
626 | 0 | } |
627 | 2.77M | return retval; |
628 | 2.77M | } |
629 | | |
630 | | // Returns the script-id (eg Han) of the dominant script in the word. |
631 | 0 | int WERD_CHOICE::GetTopScriptID() const { |
632 | 0 | unsigned max_script = unicharset_->get_script_table_size(); |
633 | 0 | std::vector<unsigned> sid(max_script); |
634 | 0 | for (unsigned x = 0; x < length_; ++x) { |
635 | 0 | int script_id = unicharset_->get_script(unichar_id(x)); |
636 | 0 | sid[script_id]++; |
637 | 0 | } |
638 | 0 | if (unicharset_->han_sid() != unicharset_->null_sid()) { |
639 | | // Add the Hiragana & Katakana counts to Han and zero them out. |
640 | 0 | if (unicharset_->hiragana_sid() != unicharset_->null_sid()) { |
641 | 0 | sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()]; |
642 | 0 | sid[unicharset_->hiragana_sid()] = 0; |
643 | 0 | } |
644 | 0 | if (unicharset_->katakana_sid() != unicharset_->null_sid()) { |
645 | 0 | sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()]; |
646 | 0 | sid[unicharset_->katakana_sid()] = 0; |
647 | 0 | } |
648 | 0 | } |
649 | | // Note that high script ID overrides lower one on a tie, thus biasing |
650 | | // towards non-Common script (if sorted that way in unicharset file). |
651 | 0 | unsigned max_sid = 0; |
652 | 0 | for (unsigned x = 1; x < max_script; x++) { |
653 | 0 | if (sid[x] >= sid[max_sid]) { |
654 | 0 | max_sid = x; |
655 | 0 | } |
656 | 0 | } |
657 | 0 | if (sid[max_sid] < length_ / 2) { |
658 | 0 | max_sid = unicharset_->null_sid(); |
659 | 0 | } |
660 | 0 | return max_sid; |
661 | 0 | } |
662 | | |
663 | | // Fixes the state_ for a chop at the given blob_posiiton. |
664 | 567k | void WERD_CHOICE::UpdateStateForSplit(int blob_position) { |
665 | 567k | int total_chunks = 0; |
666 | 3.07M | for (unsigned i = 0; i < length_; ++i) { |
667 | 3.07M | total_chunks += state_[i]; |
668 | 3.07M | if (total_chunks > blob_position) { |
669 | 567k | ++state_[i]; |
670 | 567k | return; |
671 | 567k | } |
672 | 3.07M | } |
673 | 567k | } |
674 | | |
675 | | // Returns the sum of all the state elements, being the total number of blobs. |
676 | 5.63M | unsigned WERD_CHOICE::TotalOfStates() const { |
677 | 5.63M | unsigned total_chunks = 0; |
678 | 54.2M | for (unsigned i = 0; i < length_; ++i) { |
679 | 48.6M | total_chunks += state_[i]; |
680 | 48.6M | } |
681 | 5.63M | return total_chunks; |
682 | 5.63M | } |
683 | | |
684 | | /** |
685 | | * WERD_CHOICE::print |
686 | | * |
687 | | * Print WERD_CHOICE to stdout. |
688 | | */ |
689 | 0 | void WERD_CHOICE::print(const char *msg) const { |
690 | 0 | tprintf("%s : ", msg); |
691 | 0 | for (unsigned i = 0; i < length_; ++i) { |
692 | 0 | tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i])); |
693 | 0 | } |
694 | 0 | tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n", rating_, certainty_, |
695 | 0 | adjust_factor_, permuter_, min_x_height_, max_x_height_, dangerous_ambig_found_); |
696 | 0 | tprintf("pos"); |
697 | 0 | for (unsigned i = 0; i < length_; ++i) { |
698 | 0 | tprintf("\t%s", ScriptPosToString(script_pos_[i])); |
699 | 0 | } |
700 | 0 | tprintf("\nstr"); |
701 | 0 | for (unsigned i = 0; i < length_; ++i) { |
702 | 0 | tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i])); |
703 | 0 | } |
704 | 0 | tprintf("\nstate:"); |
705 | 0 | for (unsigned i = 0; i < length_; ++i) { |
706 | 0 | tprintf("\t%d ", state_[i]); |
707 | 0 | } |
708 | 0 | tprintf("\nC"); |
709 | 0 | for (unsigned i = 0; i < length_; ++i) { |
710 | 0 | tprintf("\t%.3f", certainties_[i]); |
711 | 0 | } |
712 | 0 | tprintf("\n"); |
713 | 0 | } |
714 | | |
715 | | // Prints the segmentation state with an introductory message. |
716 | 0 | void WERD_CHOICE::print_state(const char *msg) const { |
717 | 0 | tprintf("%s", msg); |
718 | 0 | for (unsigned i = 0; i < length_; ++i) { |
719 | 0 | tprintf(" %d", state_[i]); |
720 | 0 | } |
721 | 0 | tprintf("\n"); |
722 | 0 | } |
723 | | |
724 | | #ifndef GRAPHICS_DISABLED |
725 | | |
726 | | // Displays the segmentation state of *this (if not the same as the last |
727 | | // one displayed) and waits for a click in the window. |
728 | | void WERD_CHOICE::DisplaySegmentation(TWERD *word) { |
729 | | // Number of different colors to draw with. |
730 | | const int kNumColors = 6; |
731 | | static ScrollView *segm_window = nullptr; |
732 | | // Check the state against the static prev_drawn_state. |
733 | | static std::vector<int> prev_drawn_state; |
734 | | bool already_done = prev_drawn_state.size() == length_; |
735 | | if (!already_done) { |
736 | | prev_drawn_state.clear(); |
737 | | prev_drawn_state.resize(length_); |
738 | | } |
739 | | for (unsigned i = 0; i < length_; ++i) { |
740 | | if (prev_drawn_state[i] != state_[i]) { |
741 | | already_done = false; |
742 | | } |
743 | | prev_drawn_state[i] = state_[i]; |
744 | | } |
745 | | if (already_done || word->blobs.empty()) { |
746 | | return; |
747 | | } |
748 | | |
749 | | // Create the window if needed. |
750 | | if (segm_window == nullptr) { |
751 | | segm_window = new ScrollView("Segmentation", 5, 10, 500, 256, 2000.0, 256.0, true); |
752 | | } else { |
753 | | segm_window->Clear(); |
754 | | } |
755 | | |
756 | | TBOX bbox; |
757 | | int blob_index = 0; |
758 | | for (unsigned c = 0; c < length_; ++c) { |
759 | | auto color = static_cast<ScrollView::Color>(c % kNumColors + 3); |
760 | | for (int i = 0; i < state_[c]; ++i, ++blob_index) { |
761 | | TBLOB *blob = word->blobs[blob_index]; |
762 | | bbox += blob->bounding_box(); |
763 | | blob->plot(segm_window, color, color); |
764 | | } |
765 | | } |
766 | | segm_window->ZoomToRectangle(bbox.left(), bbox.top(), bbox.right(), bbox.bottom()); |
767 | | segm_window->Update(); |
768 | | segm_window->Wait(); |
769 | | } |
770 | | |
771 | | #endif // !GRAPHICS_DISABLED |
772 | | |
773 | 0 | bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2) { |
774 | 0 | const UNICHARSET *uchset = word1.unicharset(); |
775 | 0 | if (word2.unicharset() != uchset) { |
776 | 0 | return false; |
777 | 0 | } |
778 | 0 | unsigned w1start, w1end; |
779 | 0 | word1.punct_stripped(&w1start, &w1end); |
780 | 0 | unsigned w2start, w2end; |
781 | 0 | word2.punct_stripped(&w2start, &w2end); |
782 | 0 | if (w1end - w1start != w2end - w2start) { |
783 | 0 | return false; |
784 | 0 | } |
785 | 0 | for (unsigned i = 0; i < w1end - w1start; i++) { |
786 | 0 | if (uchset->to_lower(word1.unichar_id(w1start + i)) != |
787 | 0 | uchset->to_lower(word2.unichar_id(w2start + i))) { |
788 | 0 | return false; |
789 | 0 | } |
790 | 0 | } |
791 | 0 | return true; |
792 | 0 | } |
793 | | |
794 | | /** |
795 | | * print_ratings_list |
796 | | * |
797 | | * Send all the ratings out to the logfile. |
798 | | * |
799 | | * @param msg intro message |
800 | | * @param ratings list of ratings |
801 | | * @param current_unicharset unicharset that can be used |
802 | | * for id-to-unichar conversion |
803 | | */ |
804 | | void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, |
805 | 0 | const UNICHARSET ¤t_unicharset) { |
806 | 0 | if (ratings->empty()) { |
807 | 0 | tprintf("%s:<none>\n", msg); |
808 | 0 | return; |
809 | 0 | } |
810 | 0 | if (*msg != '\0') { |
811 | 0 | tprintf("%s\n", msg); |
812 | 0 | } |
813 | 0 | BLOB_CHOICE_IT c_it; |
814 | 0 | c_it.set_to_list(ratings); |
815 | 0 | for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { |
816 | 0 | c_it.data()->print(¤t_unicharset); |
817 | 0 | if (!c_it.at_last()) { |
818 | 0 | tprintf("\n"); |
819 | 0 | } |
820 | 0 | } |
821 | 0 | tprintf("\n"); |
822 | 0 | fflush(stdout); |
823 | 0 | } |
824 | | |
825 | | } // namespace tesseract |