/src/tesseract/src/ccstruct/ratngs.h
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: ratngs.h (Formerly ratings.h) |
3 | | * Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | * (C) Copyright 1992, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | #ifndef RATNGS_H |
20 | | #define RATNGS_H |
21 | | |
22 | | #ifdef HAVE_CONFIG_H |
23 | | # include "config_auto.h" // DISABLED_LEGACY_ENGINE |
24 | | #endif |
25 | | |
26 | | #include "clst.h" |
27 | | #include "elst.h" |
28 | | #ifndef DISABLED_LEGACY_ENGINE |
29 | | # include "fontinfo.h" |
30 | | #endif // undef DISABLED_LEGACY_ENGINE |
31 | | #include "matrix.h" |
32 | | #include "unicharset.h" |
33 | | #include "werd.h" |
34 | | |
35 | | #include <tesseract/unichar.h> |
36 | | |
37 | | #include <cassert> |
38 | | #include <cfloat> // for FLT_MAX |
39 | | |
40 | | namespace tesseract { |
41 | | |
42 | | class MATRIX; |
43 | | struct TBLOB; |
44 | | struct TWERD; |
45 | | |
// Identifies the source of a BLOB_CHOICE, making it possible to determine
// whether a blob has been classified by inspecting its BLOB_CHOICEs.
enum BlobChoiceClassifier {
  BCC_STATIC_CLASSIFIER,  // From the char_norm classifier.
  BCC_ADAPTED_CLASSIFIER, // From the adaptive classifier.
  BCC_SPECKLE_CLASSIFIER, // Backup for failed classification.
  BCC_AMBIG,              // Generated by ambiguity detection.
  BCC_FAKE,               // From some other process.
};
55 | | |
56 | | class BLOB_CHOICE : public ELIST<BLOB_CHOICE>::LINK { |
57 | | public: |
58 | 371 | BLOB_CHOICE() { |
59 | 371 | unichar_id_ = UNICHAR_SPACE; |
60 | 371 | fontinfo_id_ = -1; |
61 | 371 | fontinfo_id2_ = -1; |
62 | 371 | rating_ = 10.0f; |
63 | 371 | certainty_ = -1.0f; |
64 | 371 | script_id_ = -1; |
65 | 371 | min_xheight_ = 0.0f; |
66 | 371 | max_xheight_ = 0.0f; |
67 | 371 | yshift_ = 0.0f; |
68 | 371 | classifier_ = BCC_FAKE; |
69 | 371 | } |
70 | | BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id |
71 | | float src_rating, // rating |
72 | | float src_cert, // certainty |
73 | | int script_id, // script |
74 | | float min_xheight, // min xheight in image pixel units |
75 | | float max_xheight, // max xheight allowed by this char |
76 | | float yshift, // the larger of y shift (top or bottom) |
77 | | BlobChoiceClassifier c); // adapted match or other |
78 | | BLOB_CHOICE(const BLOB_CHOICE &other); |
79 | 11.0M | ~BLOB_CHOICE() = default; |
80 | | |
81 | 300M | UNICHAR_ID unichar_id() const { |
82 | 300M | return unichar_id_; |
83 | 300M | } |
84 | 70.7M | float rating() const { |
85 | 70.7M | return rating_; |
86 | 70.7M | } |
87 | 45.1M | float certainty() const { |
88 | 45.1M | return certainty_; |
89 | 45.1M | } |
90 | 42.4M | int16_t fontinfo_id() const { |
91 | 42.4M | return fontinfo_id_; |
92 | 42.4M | } |
93 | 35.6M | int16_t fontinfo_id2() const { |
94 | 35.6M | return fontinfo_id2_; |
95 | 35.6M | } |
96 | | #ifndef DISABLED_LEGACY_ENGINE |
97 | 177k | const std::vector<ScoredFont> &fonts() const { |
98 | 177k | return fonts_; |
99 | 177k | } |
100 | 3.62M | void set_fonts(const std::vector<ScoredFont> &fonts) { |
101 | 3.62M | fonts_ = fonts; |
102 | 3.62M | int score1 = 0, score2 = 0; |
103 | 3.62M | fontinfo_id_ = -1; |
104 | 3.62M | fontinfo_id2_ = -1; |
105 | 111M | for (auto &f : fonts_) { |
106 | 111M | if (f.score > score1) { |
107 | 15.5M | score2 = score1; |
108 | 15.5M | fontinfo_id2_ = fontinfo_id_; |
109 | 15.5M | score1 = f.score; |
110 | 15.5M | fontinfo_id_ = f.fontinfo_id; |
111 | 95.8M | } else if (f.score > score2) { |
112 | 9.31M | score2 = f.score; |
113 | 9.31M | fontinfo_id2_ = f.fontinfo_id; |
114 | 9.31M | } |
115 | 111M | } |
116 | 3.62M | } |
117 | | #endif // ndef DISABLED_LEGACY_ENGINE |
118 | 22.0k | int script_id() const { |
119 | 22.0k | return script_id_; |
120 | 22.0k | } |
121 | 8.58M | const MATRIX_COORD &matrix_cell() { |
122 | 8.58M | return matrix_cell_; |
123 | 8.58M | } |
124 | 8.33M | float min_xheight() const { |
125 | 8.33M | return min_xheight_; |
126 | 8.33M | } |
127 | 8.33M | float max_xheight() const { |
128 | 8.33M | return max_xheight_; |
129 | 8.33M | } |
130 | 14.0M | float yshift() const { |
131 | 14.0M | return yshift_; |
132 | 14.0M | } |
133 | 0 | BlobChoiceClassifier classifier() const { |
134 | 0 | return classifier_; |
135 | 0 | } |
136 | 8.79M | bool IsAdapted() const { |
137 | 8.79M | return classifier_ == BCC_ADAPTED_CLASSIFIER; |
138 | 8.79M | } |
139 | 1.39M | bool IsClassified() const { |
140 | 1.39M | return classifier_ == BCC_STATIC_CLASSIFIER || classifier_ == BCC_ADAPTED_CLASSIFIER || |
141 | 1.39M | classifier_ == BCC_SPECKLE_CLASSIFIER; |
142 | 1.39M | } |
143 | | |
144 | 22.4k | void set_unichar_id(UNICHAR_ID newunichar_id) { |
145 | 22.4k | unichar_id_ = newunichar_id; |
146 | 22.4k | } |
147 | 24.4k | void set_rating(float newrat) { |
148 | 24.4k | rating_ = newrat; |
149 | 24.4k | } |
150 | 22.7k | void set_certainty(float newrat) { |
151 | 22.7k | certainty_ = newrat; |
152 | 22.7k | } |
153 | 0 | void set_script(int newscript_id) { |
154 | 0 | script_id_ = newscript_id; |
155 | 0 | } |
156 | 12.0M | void set_matrix_cell(int col, int row) { |
157 | 12.0M | matrix_cell_.col = col; |
158 | 12.0M | matrix_cell_.row = row; |
159 | 12.0M | } |
160 | 22.0k | void set_classifier(BlobChoiceClassifier classifier) { |
161 | 22.0k | classifier_ = classifier; |
162 | 22.0k | } |
163 | 0 | static BLOB_CHOICE *deep_copy(const BLOB_CHOICE *src) { |
164 | 0 | auto *choice = new BLOB_CHOICE; |
165 | 0 | *choice = *src; |
166 | 0 | return choice; |
167 | 0 | } |
168 | | // Returns true if *this and other agree on the baseline and x-height |
169 | | // to within some tolerance based on a given estimate of the x-height. |
170 | | bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const; |
171 | | |
172 | 0 | void print(const UNICHARSET *unicharset) const { |
173 | 0 | tprintf("r%.2f c%.2f x[%g,%g]: %d %s", |
174 | 0 | static_cast<double>(rating_), |
175 | 0 | static_cast<double>(certainty_), |
176 | 0 | static_cast<double>(min_xheight_), |
177 | 0 | static_cast<double>(max_xheight_), |
178 | 0 | unichar_id_, (unicharset == nullptr) ? "" : unicharset->debug_str(unichar_id_).c_str()); |
179 | 0 | } |
180 | 0 | void print_full() const { |
181 | 0 | print(nullptr); |
182 | 0 | tprintf(" script=%d, font1=%d, font2=%d, yshift=%g, classifier=%d\n", script_id_, fontinfo_id_, |
183 | 0 | fontinfo_id2_, static_cast<double>(yshift_), classifier_); |
184 | 0 | } |
185 | | // Sort function for sorting BLOB_CHOICEs in increasing order of rating. |
186 | 0 | static int SortByRating(const void *p1, const void *p2) { |
187 | 0 | const BLOB_CHOICE *bc1 = *static_cast<const BLOB_CHOICE *const *>(p1); |
188 | 0 | const BLOB_CHOICE *bc2 = *static_cast<const BLOB_CHOICE *const *>(p2); |
189 | 0 | return (bc1->rating_ < bc2->rating_) ? -1 : 1; |
190 | 0 | } |
191 | | |
192 | | private: |
193 | | // Copy assignment operator. |
194 | | BLOB_CHOICE &operator=(const BLOB_CHOICE &other); |
195 | | |
196 | | UNICHAR_ID unichar_id_; // unichar id |
197 | | #ifndef DISABLED_LEGACY_ENGINE |
198 | | // Fonts and scores. Allowed to be empty. |
199 | | std::vector<ScoredFont> fonts_; |
200 | | #endif // ndef DISABLED_LEGACY_ENGINE |
201 | | int16_t fontinfo_id_; // char font information |
202 | | int16_t fontinfo_id2_; // 2nd choice font information |
203 | | // Rating is the classifier distance weighted by the length of the outline |
204 | | // in the blob. In terms of probability, classifier distance is -klog p such |
205 | | // that the resulting distance is in the range [0, 1] and then |
206 | | // rating = w (-k log p) where w is the weight for the length of the outline. |
207 | | // Sums of ratings may be compared meaningfully for words of different |
208 | | // segmentation. |
209 | | float rating_; // size related |
210 | | // Certainty is a number in [-20, 0] indicating the classifier certainty |
211 | | // of the choice. In terms of probability, certainty = 20 (k log p) where |
212 | | // k is defined as above to normalize -klog p to the range [0, 1]. |
213 | | float certainty_; // absolute |
214 | | int script_id_; |
215 | | // Holds the position of this choice in the ratings matrix. |
216 | | // Used to location position in the matrix during path backtracking. |
217 | | MATRIX_COORD matrix_cell_; |
218 | | // X-height range (in image pixels) that this classification supports. |
219 | | float min_xheight_; |
220 | | float max_xheight_; |
221 | | // yshift_ - The vertical distance (in image pixels) the character is |
222 | | // shifted (up or down) from an acceptable y position. |
223 | | float yshift_; |
224 | | BlobChoiceClassifier classifier_; // What generated *this. |
225 | | }; |
226 | | |
227 | | // Make BLOB_CHOICE listable. |
228 | | ELISTIZEH(BLOB_CHOICE) |
229 | | |
230 | | // Return the BLOB_CHOICE in bc_list matching a given unichar_id, |
231 | | // or nullptr if there is no match. |
232 | | BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list); |
233 | | |
// Permuter codes used in WERD_CHOICEs. The numeric values are listed
// explicitly in the trailing comments; NUM_PERMUTER_TYPES is the count of
// real codes and must stay last.
enum PermuterType {
  NO_PERM,           // 0
  PUNC_PERM,         // 1
  TOP_CHOICE_PERM,   // 2
  LOWER_CASE_PERM,   // 3
  UPPER_CASE_PERM,   // 4
  NGRAM_PERM,        // 5
  NUMBER_PERM,       // 6
  USER_PATTERN_PERM, // 7
  SYSTEM_DAWG_PERM,  // 8
  DOC_DAWG_PERM,     // 9
  USER_DAWG_PERM,    // 10
  FREQ_DAWG_PERM,    // 11
  COMPOUND_PERM,     // 12

  NUM_PERMUTER_TYPES
};
252 | | |
// ScriptPos tells whether a character is subscript, superscript, a dropcap
// or normal.
enum ScriptPos { SP_NORMAL, SP_SUBSCRIPT, SP_SUPERSCRIPT, SP_DROPCAP };

// Returns a printable name for the given script position.
const char *ScriptPosToString(ScriptPos script_pos);
257 | | |
258 | | class TESS_API WERD_CHOICE : public ELIST<WERD_CHOICE>::LINK { |
259 | | public: |
260 | | static const float kBadRating; |
261 | | static const char *permuter_name(uint8_t permuter); |
262 | | |
263 | 644k | WERD_CHOICE(const UNICHARSET *unicharset) : unicharset_(unicharset) { |
264 | 644k | this->init(8); |
265 | 644k | } |
266 | 581k | WERD_CHOICE(const UNICHARSET *unicharset, int reserved) : unicharset_(unicharset) { |
267 | 581k | this->init(reserved); |
268 | 581k | } |
269 | | WERD_CHOICE(const char *src_string, const char *src_lengths, float src_rating, |
270 | | float src_certainty, uint8_t src_permuter, const UNICHARSET &unicharset) |
271 | 0 | : unicharset_(&unicharset) { |
272 | 0 | this->init(src_string, src_lengths, src_rating, src_certainty, src_permuter); |
273 | 0 | } |
274 | | WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset); |
275 | 327k | WERD_CHOICE(const WERD_CHOICE &word) : ELIST<WERD_CHOICE>::LINK(word), unicharset_(word.unicharset_) { |
276 | 327k | this->init(word.length()); |
277 | 327k | this->operator=(word); |
278 | 327k | } |
279 | | ~WERD_CHOICE(); |
280 | | |
281 | 1.61M | const UNICHARSET *unicharset() const { |
282 | 1.61M | return unicharset_; |
283 | 1.61M | } |
284 | 906k | bool empty() const { |
285 | 906k | return length_ == 0; |
286 | 906k | } |
287 | 54.0M | inline unsigned length() const { |
288 | 54.0M | return length_; |
289 | 54.0M | } |
290 | 862k | float adjust_factor() const { |
291 | 862k | return adjust_factor_; |
292 | 862k | } |
293 | 331k | void set_adjust_factor(float factor) { |
294 | 331k | adjust_factor_ = factor; |
295 | 331k | } |
296 | 499k | inline const std::vector<UNICHAR_ID> &unichar_ids() const { |
297 | 499k | return unichar_ids_; |
298 | 499k | } |
299 | 38.7M | inline UNICHAR_ID unichar_id(unsigned index) const { |
300 | 38.7M | assert(index < length_); |
301 | 38.7M | return unichar_ids_[index]; |
302 | 38.7M | } |
303 | 21.4M | inline unsigned state(unsigned index) const { |
304 | 21.4M | return state_[index]; |
305 | 21.4M | } |
306 | 5.25M | ScriptPos BlobPosition(unsigned index) const { |
307 | 5.25M | if (index >= length_) { |
308 | 0 | return SP_NORMAL; |
309 | 0 | } |
310 | 5.25M | return script_pos_[index]; |
311 | 5.25M | } |
312 | 5.91M | inline float rating() const { |
313 | 5.91M | return rating_; |
314 | 5.91M | } |
315 | 1.76M | inline float certainty() const { |
316 | 1.76M | return certainty_; |
317 | 1.76M | } |
318 | 1.30M | inline float certainty(unsigned index) const { |
319 | 1.30M | return certainties_[index]; |
320 | 1.30M | } |
321 | 327k | inline float min_x_height() const { |
322 | 327k | return min_x_height_; |
323 | 327k | } |
324 | 327k | inline float max_x_height() const { |
325 | 327k | return max_x_height_; |
326 | 327k | } |
327 | 331k | inline void set_x_heights(float min_height, float max_height) { |
328 | 331k | min_x_height_ = min_height; |
329 | 331k | max_x_height_ = max_height; |
330 | 331k | } |
331 | 1.44M | inline uint8_t permuter() const { |
332 | 1.44M | return permuter_; |
333 | 1.44M | } |
334 | | const char *permuter_name() const; |
335 | | // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word, |
336 | | // taken from the appropriate cell in the ratings MATRIX. |
337 | | // Borrowed pointer, so do not delete. |
338 | | BLOB_CHOICE_LIST *blob_choices(unsigned index, MATRIX *ratings) const; |
339 | | |
340 | | // Returns the MATRIX_COORD corresponding to the location in the ratings |
341 | | // MATRIX for the given index into the word. |
342 | | MATRIX_COORD MatrixCoord(unsigned index) const; |
343 | | |
344 | 409 | inline void set_unichar_id(UNICHAR_ID unichar_id, unsigned index) { |
345 | 409 | assert(index < length_); |
346 | 409 | unichar_ids_[index] = unichar_id; |
347 | 409 | } |
348 | 187k | bool dangerous_ambig_found() const { |
349 | 187k | return dangerous_ambig_found_; |
350 | 187k | } |
351 | 331k | void set_dangerous_ambig_found_(bool value) { |
352 | 331k | dangerous_ambig_found_ = value; |
353 | 331k | } |
354 | 1.25M | inline void set_rating(float new_val) { |
355 | 1.25M | rating_ = new_val; |
356 | 1.25M | } |
357 | 755k | inline void set_certainty(float new_val) { |
358 | 755k | certainty_ = new_val; |
359 | 755k | } |
360 | 928k | inline void set_permuter(uint8_t perm) { |
361 | 928k | permuter_ = perm; |
362 | 928k | } |
363 | | // Note: this function should only be used if all the fields |
364 | | // are populated manually with set_* functions (rather than |
365 | | // (copy)constructors and append_* functions). |
366 | 331k | inline void set_length(unsigned len) { |
367 | 331k | ASSERT_HOST(reserved_ >= len); |
368 | 331k | length_ = len; |
369 | 331k | } |
370 | | |
371 | | /// Make more space in unichar_id_ and fragment_lengths_ arrays. |
372 | 752k | inline void double_the_size() { |
373 | 752k | if (reserved_ > 0) { |
374 | 752k | reserved_ *= 2; |
375 | 752k | } else { |
376 | 0 | reserved_ = 1; |
377 | 0 | } |
378 | 752k | unichar_ids_.resize(reserved_); |
379 | 752k | script_pos_.resize(reserved_); |
380 | 752k | state_.resize(reserved_); |
381 | 752k | certainties_.resize(reserved_); |
382 | 752k | } |
383 | | |
384 | | /// Initializes WERD_CHOICE - reserves length slots in unichar_ids_ and |
385 | | /// fragment_length_ arrays. Sets other values to default (blank) values. |
386 | 1.55M | inline void init(unsigned reserved) { |
387 | 1.55M | reserved_ = reserved; |
388 | 1.55M | if (reserved > 0) { |
389 | 1.55M | unichar_ids_.resize(reserved); |
390 | 1.55M | script_pos_.resize(reserved); |
391 | 1.55M | state_.resize(reserved); |
392 | 1.55M | certainties_.resize(reserved); |
393 | 1.55M | } else { |
394 | 124 | unichar_ids_.clear(); |
395 | 124 | script_pos_.clear(); |
396 | 124 | state_.clear(); |
397 | 124 | certainties_.clear(); |
398 | 124 | } |
399 | 1.55M | length_ = 0; |
400 | 1.55M | adjust_factor_ = 1.0f; |
401 | 1.55M | rating_ = 0.0; |
402 | 1.55M | certainty_ = FLT_MAX; |
403 | 1.55M | min_x_height_ = 0.0f; |
404 | 1.55M | max_x_height_ = FLT_MAX; |
405 | 1.55M | permuter_ = NO_PERM; |
406 | 1.55M | unichars_in_script_order_ = false; // Tesseract is strict left-to-right. |
407 | 1.55M | dangerous_ambig_found_ = false; |
408 | 1.55M | } |
409 | | |
410 | | /// Helper function to build a WERD_CHOICE from the given string, |
411 | | /// fragment lengths, rating, certainty and permuter. |
412 | | /// The function assumes that src_string is not nullptr. |
413 | | /// src_lengths argument could be nullptr, in which case the unichars |
414 | | /// in src_string are assumed to all be of length 1. |
415 | | void init(const char *src_string, const char *src_lengths, float src_rating, float src_certainty, |
416 | | uint8_t src_permuter); |
417 | | |
418 | | /// Set the fields in this choice to be default (bad) values. |
419 | 211k | inline void make_bad() { |
420 | 211k | length_ = 0; |
421 | 211k | rating_ = kBadRating; |
422 | 211k | certainty_ = -FLT_MAX; |
423 | 211k | } |
424 | | |
425 | | /// This function assumes that there is enough space reserved |
426 | | /// in the WERD_CHOICE for adding another unichar. |
427 | | /// This is an efficient alternative to append_unichar_id(). |
428 | | inline void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, |
429 | 2.01M | float certainty) { |
430 | 2.01M | assert(reserved_ > length_); |
431 | 2.01M | length_++; |
432 | 2.01M | this->set_unichar_id(unichar_id, blob_count, rating, certainty, length_ - 1); |
433 | 2.01M | } |
434 | | |
435 | | void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty); |
436 | | |
437 | | inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty, |
438 | 2.01M | unsigned index) { |
439 | 2.01M | assert(index < length_); |
440 | 2.01M | unichar_ids_[index] = unichar_id; |
441 | 2.01M | state_[index] = blob_count; |
442 | 2.01M | certainties_[index] = certainty; |
443 | 2.01M | script_pos_[index] = SP_NORMAL; |
444 | 2.01M | rating_ += rating; |
445 | 2.01M | if (certainty < certainty_) { |
446 | 754k | certainty_ = certainty; |
447 | 754k | } |
448 | 2.01M | } |
449 | | // Sets the entries for the given index from the BLOB_CHOICE, assuming |
450 | | // unit fragment lengths, but setting the state for this index to blob_count. |
451 | | void set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice); |
452 | | |
453 | | bool contains_unichar_id(UNICHAR_ID unichar_id) const; |
454 | | void remove_unichar_ids(unsigned index, int num); |
455 | 470k | inline void remove_last_unichar_id() { |
456 | 470k | --length_; |
457 | 470k | } |
458 | 102k | inline void remove_unichar_id(unsigned index) { |
459 | 102k | this->remove_unichar_ids(index, 1); |
460 | 102k | } |
461 | | bool has_rtl_unichar_id() const; |
462 | | void reverse_and_mirror_unichar_ids(); |
463 | | |
464 | | // Returns the half-open interval of unichar_id indices [start, end) which |
465 | | // enclose the core portion of this word -- the part after stripping |
466 | | // punctuation from the left and right. |
467 | | void punct_stripped(unsigned *start_core, unsigned *end_core) const; |
468 | | |
469 | | // Returns the indices [start, end) containing the core of the word, stripped |
470 | | // of any superscript digits on either side. (i.e., the non-footnote part |
471 | | // of the word). There is no guarantee that the output range is non-empty. |
472 | | void GetNonSuperscriptSpan(int *start, int *end) const; |
473 | | |
474 | | // Return a copy of this WERD_CHOICE with the choices [start, end). |
475 | | // The result is useful only for checking against a dictionary. |
476 | | WERD_CHOICE shallow_copy(unsigned start, unsigned end) const; |
477 | | |
478 | | void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const; |
479 | 0 | std::string debug_string() const { |
480 | 0 | std::string word_str; |
481 | 0 | for (unsigned i = 0; i < length_; ++i) { |
482 | 0 | word_str += unicharset_->debug_str(unichar_ids_[i]); |
483 | 0 | word_str += " "; |
484 | 0 | } |
485 | 0 | return word_str; |
486 | 0 | } |
487 | | // Returns true if any unichar_id in the word is a non-space-delimited char. |
488 | 0 | bool ContainsAnyNonSpaceDelimited() const { |
489 | 0 | for (unsigned i = 0; i < length_; ++i) { |
490 | 0 | if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) { |
491 | 0 | return true; |
492 | 0 | } |
493 | 0 | } |
494 | 0 | return false; |
495 | 0 | } |
496 | | // Returns true if the word is all spaces. |
497 | 53.0k | bool IsAllSpaces() const { |
498 | 53.1k | for (unsigned i = 0; i < length_; ++i) { |
499 | 53.0k | if (unichar_ids_[i] != UNICHAR_SPACE) { |
500 | 52.9k | return false; |
501 | 52.9k | } |
502 | 53.0k | } |
503 | 54 | return true; |
504 | 53.0k | } |
505 | | |
506 | | // Call this to override the default (strict left to right graphemes) |
507 | | // with the fact that some engine produces a "reading order" set of |
508 | | // Graphemes for each word. |
509 | 0 | bool set_unichars_in_script_order(bool in_script_order) { |
510 | 0 | return unichars_in_script_order_ = in_script_order; |
511 | 0 | } |
512 | | |
513 | 0 | bool unichars_in_script_order() const { |
514 | 0 | return unichars_in_script_order_; |
515 | 0 | } |
516 | | |
517 | | // Returns a UTF-8 string equivalent to the current choice |
518 | | // of UNICHAR IDs. |
519 | 2.59M | std::string &unichar_string() { |
520 | 2.59M | this->string_and_lengths(&unichar_string_, &unichar_lengths_); |
521 | 2.59M | return unichar_string_; |
522 | 2.59M | } |
523 | | |
524 | | // Returns a UTF-8 string equivalent to the current choice |
525 | | // of UNICHAR IDs. |
526 | 0 | const std::string &unichar_string() const { |
527 | 0 | this->string_and_lengths(&unichar_string_, &unichar_lengths_); |
528 | 0 | return unichar_string_; |
529 | 0 | } |
530 | | |
531 | | // Returns the lengths, one byte each, representing the number of bytes |
532 | | // required in the unichar_string for each UNICHAR_ID. |
533 | 229k | const std::string &unichar_lengths() const { |
534 | 229k | this->string_and_lengths(&unichar_string_, &unichar_lengths_); |
535 | 229k | return unichar_lengths_; |
536 | 229k | } |
537 | | |
538 | | // Sets up the script_pos_ member using the blobs_list to get the bln |
539 | | // bounding boxes, *this to get the unichars, and this->unicharset |
540 | | // to get the target positions. If small_caps is true, sub/super are not |
541 | | // considered, but dropcaps are. |
542 | | // NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.) |
543 | | void SetScriptPositions(bool small_caps, TWERD *word, int debug = 0); |
544 | | // Sets all the script_pos_ positions to the given position. |
545 | | void SetAllScriptPositions(ScriptPos position); |
546 | | |
547 | | static ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, |
548 | | const TBOX &blob_box, UNICHAR_ID unichar_id); |
549 | | |
550 | | // Returns the "dominant" script ID for the word. By "dominant", the script |
551 | | // must account for at least half the characters. Otherwise, it returns 0. |
552 | | // Note that for Japanese, Hiragana and Katakana are simply treated as Han. |
553 | | int GetTopScriptID() const; |
554 | | |
555 | | // Fixes the state_ for a chop at the given blob_posiiton. |
556 | | void UpdateStateForSplit(int blob_position); |
557 | | |
558 | | // Returns the sum of all the state elements, being the total number of blobs. |
559 | | unsigned TotalOfStates() const; |
560 | | |
561 | 0 | void print() const { |
562 | 0 | this->print(""); |
563 | 0 | } |
564 | | void print(const char *msg) const; |
565 | | // Prints the segmentation state with an introductory message. |
566 | | void print_state(const char *msg) const; |
567 | | |
568 | | // Displays the segmentation state of *this (if not the same as the last |
569 | | // one displayed) and waits for a click in the window. |
570 | | void DisplaySegmentation(TWERD *word); |
571 | | |
572 | | WERD_CHOICE &operator+=( // concatanate |
573 | | const WERD_CHOICE &second); // second on first |
574 | | |
575 | | WERD_CHOICE &operator=(const WERD_CHOICE &source); |
576 | | |
577 | | private: |
578 | | const UNICHARSET *unicharset_; |
579 | | // TODO(rays) Perhaps replace the multiple arrays with an array of structs? |
580 | | // unichar_ids_ is an array of classifier "results" that make up a word. |
581 | | // For each unichar_ids_[i], script_pos_[i] has the sub/super/normal position |
582 | | // of each unichar_id. |
583 | | // state_[i] indicates the number of blobs in WERD_RES::chopped_word that |
584 | | // were put together to make the classification results in the ith position |
585 | | // in unichar_ids_, and certainties_[i] is the certainty of the choice that |
586 | | // was used in this word. |
587 | | // == Change from before == |
588 | | // Previously there was fragment_lengths_ that allowed a word to be |
589 | | // artificially composed of multiple fragment results. Since the new |
590 | | // segmentation search doesn't do fragments, treatment of fragments has |
591 | | // been moved to a lower level, augmenting the ratings matrix with the |
592 | | // combined fragments, and allowing the language-model/segmentation-search |
593 | | // to deal with only the combined unichar_ids. |
594 | | std::vector<UNICHAR_ID> unichar_ids_; // unichar ids that represent the text of the word |
595 | | std::vector<ScriptPos> script_pos_; // Normal/Sub/Superscript of each unichar. |
596 | | std::vector<int> state_; // Number of blobs in each unichar. |
597 | | std::vector<float> certainties_; // Certainty of each unichar. |
598 | | unsigned reserved_; // size of the above arrays |
599 | | unsigned length_; // word length |
600 | | // Factor that was used to adjust the rating. |
601 | | float adjust_factor_; |
602 | | // Rating is the sum of the ratings of the individual blobs in the word. |
603 | | float rating_; // size related |
604 | | // certainty is the min (worst) certainty of the individual blobs in the word. |
605 | | float certainty_; // absolute |
606 | | // xheight computed from the result, or 0 if inconsistent. |
607 | | float min_x_height_; |
608 | | float max_x_height_; |
609 | | uint8_t permuter_; // permuter code |
610 | | |
611 | | // Normally, the ratings_ matrix represents the recognition results in order |
612 | | // from left-to-right. However, some engines (say Cube) may return |
613 | | // recognition results in the order of the script's major reading direction |
614 | | // (for Arabic, that is right-to-left). |
615 | | bool unichars_in_script_order_; |
616 | | // True if NoDangerousAmbig found an ambiguity. |
617 | | bool dangerous_ambig_found_; |
618 | | |
619 | | // The following variables are populated and passed by reference any |
620 | | // time unichar_string() or unichar_lengths() are called. |
621 | | mutable std::string unichar_string_; |
622 | | mutable std::string unichar_lengths_; |
623 | | }; |
624 | | |
625 | | // Make WERD_CHOICE listable. |
626 | | ELISTIZEH(WERD_CHOICE) |
627 | | using BLOB_CHOICE_LIST_VECTOR = std::vector<BLOB_CHOICE_LIST *>; |
628 | | |
629 | | // Utilities for comparing WERD_CHOICEs |
630 | | |
631 | | bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2); |
632 | | |
633 | | // Utilities for debug printing. |
634 | | void print_ratings_list(const char *msg, // intro message |
635 | | BLOB_CHOICE_LIST *ratings, // list of results |
636 | | const UNICHARSET ¤t_unicharset // unicharset that can be used |
637 | | // for id-to-unichar conversion |
638 | | ); |
639 | | |
640 | | } // namespace tesseract |
641 | | |
642 | | #endif |