/src/tesseract/src/dict/dict.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: dict.cpp |
3 | | // Description: dict class. |
4 | | // Author: Samuel Charron |
5 | | // |
6 | | // (C) Copyright 2006, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | // |
17 | | /////////////////////////////////////////////////////////////////////// |
18 | | |
19 | | #include "dict.h" |
20 | | |
21 | | #include "tesserrstream.h" // for tesserr |
22 | | #include "tprintf.h" |
23 | | |
24 | | #include <cstdio> |
25 | | |
26 | | namespace tesseract { |
27 | | |
28 | | class Image; |
29 | | |
30 | | Dict::Dict(CCUtil *ccutil) |
31 | 4 | : letter_is_okay_(&tesseract::Dict::def_letter_is_okay) |
32 | 4 | , probability_in_context_(&tesseract::Dict::def_probability_in_context) |
33 | 4 | , ccutil_(ccutil) |
34 | 4 | , wildcard_unichar_id_(INVALID_UNICHAR_ID) |
35 | 4 | , apostrophe_unichar_id_(INVALID_UNICHAR_ID) |
36 | 4 | , question_unichar_id_(INVALID_UNICHAR_ID) |
37 | 4 | , slash_unichar_id_(INVALID_UNICHAR_ID) |
38 | 4 | , hyphen_unichar_id_(INVALID_UNICHAR_ID) |
39 | 4 | , STRING_MEMBER(user_words_file, "", "A filename of user-provided words.", |
40 | | getCCUtil()->params()) |
41 | 4 | , STRING_INIT_MEMBER(user_words_suffix, "", |
42 | | "A suffix of user-provided words located in tessdata.", |
43 | | getCCUtil()->params()) |
44 | 4 | , STRING_MEMBER(user_patterns_file, "", "A filename of user-provided patterns.", |
45 | | getCCUtil()->params()) |
46 | 4 | , STRING_INIT_MEMBER(user_patterns_suffix, "", |
47 | | "A suffix of user-provided patterns located in " |
48 | | "tessdata.", |
49 | | getCCUtil()->params()) |
50 | 4 | , BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.", getCCUtil()->params()) |
51 | 4 | , BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", getCCUtil()->params()) |
52 | 4 | , BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.", |
53 | | getCCUtil()->params()) |
54 | 4 | , BOOL_INIT_MEMBER(load_punc_dawg, true, |
55 | | "Load dawg with punctuation" |
56 | | " patterns.", |
57 | | getCCUtil()->params()) |
58 | 4 | , BOOL_INIT_MEMBER(load_number_dawg, true, |
59 | | "Load dawg with number" |
60 | | " patterns.", |
61 | | getCCUtil()->params()) |
62 | 4 | , BOOL_INIT_MEMBER(load_bigram_dawg, true, |
63 | | "Load dawg with special word " |
64 | | "bigrams.", |
65 | | getCCUtil()->params()) |
66 | 4 | , double_MEMBER(xheight_penalty_subscripts, 0.125, |
67 | | "Score penalty (0.1 = 10%) added if there are subscripts " |
68 | | "or superscripts in a word, but it is otherwise OK.", |
69 | | getCCUtil()->params()) |
70 | 4 | , double_MEMBER(xheight_penalty_inconsistent, 0.25, |
71 | | "Score penalty (0.1 = 10%) added if an xheight is " |
72 | | "inconsistent.", |
73 | | getCCUtil()->params()) |
74 | 4 | , double_MEMBER(segment_penalty_dict_frequent_word, 1.0, |
75 | | "Score multiplier for word matches which have good case and" |
76 | | " are frequent in the given language (lower is better).", |
77 | | getCCUtil()->params()) |
78 | 4 | , double_MEMBER(segment_penalty_dict_case_ok, 1.1, |
79 | | "Score multiplier for word matches that have good case " |
80 | | "(lower is better).", |
81 | | getCCUtil()->params()) |
82 | 4 | , double_MEMBER(segment_penalty_dict_case_bad, 1.3125, |
83 | | "Default score multiplier for word matches, which may have " |
84 | | "case issues (lower is better).", |
85 | | getCCUtil()->params()) |
86 | 4 | , double_MEMBER(segment_penalty_dict_nonword, 1.25, |
87 | | "Score multiplier for glyph fragment segmentations which " |
88 | | "do not match a dictionary word (lower is better).", |
89 | | getCCUtil()->params()) |
90 | 4 | , double_MEMBER(segment_penalty_garbage, 1.50, |
91 | | "Score multiplier for poorly cased strings that are not in" |
92 | | " the dictionary and generally look like garbage (lower is" |
93 | | " better).", |
94 | | getCCUtil()->params()) |
95 | 4 | , STRING_MEMBER(output_ambig_words_file, "", |
96 | | "Output file for ambiguities found in the dictionary", getCCUtil()->params()) |
97 | 4 | , INT_MEMBER(dawg_debug_level, 0, |
98 | | "Set to 1 for general debug info" |
99 | | ", to 2 for more details, to 3 to see all the debug messages", |
100 | | getCCUtil()->params()) |
101 | 4 | , INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", getCCUtil()->params()) |
102 | 4 | , BOOL_MEMBER(use_only_first_uft8_step, false, |
103 | | "Use only the first UTF8 step of the given string" |
104 | | " when computing log probabilities.", |
105 | | getCCUtil()->params()) |
106 | 4 | , double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", getCCUtil()->params()) |
107 | 4 | , double_MEMBER(stopper_nondict_certainty_base, -2.50, "Certainty threshold for non-dict words", |
108 | | getCCUtil()->params()) |
109 | 4 | , double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, "Reject certainty offset", |
110 | | getCCUtil()->params()) |
111 | 4 | , INT_MEMBER(stopper_smallword_size, 2, "Size of dict word to be treated as non-dict word", |
112 | | getCCUtil()->params()) |
113 | 4 | , double_MEMBER(stopper_certainty_per_char, -0.50, |
114 | | "Certainty to add" |
115 | | " for each dict char above small word size.", |
116 | | getCCUtil()->params()) |
117 | 4 | , double_MEMBER(stopper_allowable_character_badness, 3.0, |
118 | | "Max certainty variation allowed in a word (in sigma)", getCCUtil()->params()) |
119 | 4 | , INT_MEMBER(stopper_debug_level, 0, "Stopper debug level", getCCUtil()->params()) |
120 | 4 | , BOOL_MEMBER(stopper_no_acceptable_choices, false, |
121 | | "Make AcceptableChoice() always return false. Useful" |
122 | | " when there is a need to explore all segmentations", |
123 | | getCCUtil()->params()) |
124 | 4 | , INT_MEMBER(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list", |
125 | | getCCUtil()->params()) |
126 | 4 | , STRING_MEMBER(word_to_debug, "", |
127 | | "Word for which stopper debug" |
128 | | " information should be printed to stdout", |
129 | | getCCUtil()->params()) |
130 | 4 | , BOOL_MEMBER(segment_nonalphabetic_script, false, |
131 | | "Don't use any alphabetic-specific tricks." |
132 | | " Set to true in the traineddata config file for" |
133 | | " scripts that are cursive or inherently fixed-pitch", |
134 | | getCCUtil()->params()) |
135 | 4 | , BOOL_MEMBER(save_doc_words, 0, "Save Document Words", getCCUtil()->params()) |
136 | 4 | , double_MEMBER(doc_dict_pending_threshold, 0.0, "Worst certainty for using pending dictionary", |
137 | | getCCUtil()->params()) |
138 | 4 | , double_MEMBER(doc_dict_certainty_threshold, -2.25, |
139 | | "Worst certainty for words that can be inserted into the" |
140 | | " document dictionary", |
141 | | getCCUtil()->params()) |
142 | 4 | , INT_MEMBER(max_permuter_attempts, 10000, |
143 | | "Maximum number of different" |
144 | | " character choices to consider during permutation." |
145 | | " This limit is especially useful when user patterns" |
146 | | " are specified, since overly generic patterns can result in" |
147 | | " dawg search exploring an overly large number of options.", |
148 | 4 | getCCUtil()->params()) { |
149 | 4 | reject_offset_ = 0.0; |
150 | 4 | go_deeper_fxn_ = nullptr; |
151 | 4 | hyphen_word_ = nullptr; |
152 | 4 | last_word_on_line_ = false; |
153 | 4 | document_words_ = nullptr; |
154 | 4 | dawg_cache_ = nullptr; |
155 | 4 | dawg_cache_is_ours_ = false; |
156 | 4 | pending_words_ = nullptr; |
157 | 4 | bigram_dawg_ = nullptr; |
158 | 4 | freq_dawg_ = nullptr; |
159 | 4 | punc_dawg_ = nullptr; |
160 | 4 | unambig_dawg_ = nullptr; |
161 | 4 | wordseg_rating_adjust_factor_ = -1.0f; |
162 | 4 | output_ambig_words_file_ = nullptr; |
163 | 4 | } |
164 | | |
165 | 0 | Dict::~Dict() { |
166 | 0 | End(); |
167 | 0 | delete hyphen_word_; |
168 | 0 | if (output_ambig_words_file_ != nullptr) { |
169 | 0 | fclose(output_ambig_words_file_); |
170 | 0 | } |
171 | 0 | } |
172 | | |
173 | 4 | DawgCache *Dict::GlobalDawgCache() { |
174 | | // This global cache (a singleton) will outlive every Tesseract instance |
175 | | // (even those that someone else might declare as global static variables). |
176 | 4 | static DawgCache cache; |
177 | 4 | return &cache; |
178 | 4 | } |
179 | | |
180 | | // Sets up ready for a Load or LoadLSTM. |
181 | 4 | void Dict::SetupForLoad(DawgCache *dawg_cache) { |
182 | 4 | if (dawgs_.size() != 0) { |
183 | 2 | this->End(); |
184 | 2 | } |
185 | | |
186 | 4 | apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol); |
187 | 4 | question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol); |
188 | 4 | slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol); |
189 | 4 | hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol); |
190 | | |
191 | 4 | if (dawg_cache != nullptr) { |
192 | 4 | dawg_cache_ = dawg_cache; |
193 | 4 | dawg_cache_is_ours_ = false; |
194 | 4 | } else { |
195 | 0 | dawg_cache_ = new DawgCache(); |
196 | 0 | dawg_cache_is_ours_ = true; |
197 | 0 | } |
198 | 4 | } |
199 | | |
200 | | // Loads the dawgs needed by Tesseract. Call FinishLoad() after. |
// Loads every enabled dictionary component for `lang` from `data_file`.
// Each optional component is fetched through the (possibly shared)
// dawg_cache_; GetSquishedDawg returns nullptr when the component is absent
// from the traineddata, so every result is checked before being added to
// dawgs_. SetupForLoad() must have been called first; FinishLoad() after.
void Dict::Load(const std::string &lang, TessdataManager *data_file) {
  // Load dawgs_.
  if (load_punc_dawg) {
    punc_dawg_ =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file);
    if (punc_dawg_) {
      dawgs_.push_back(punc_dawg_);
    }
  }
  if (load_system_dawg) {
    Dawg *system_dawg =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
    if (system_dawg) {
      dawgs_.push_back(system_dawg);
    }
  }
  if (load_number_dawg) {
    Dawg *number_dawg =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
    if (number_dawg) {
      dawgs_.push_back(number_dawg);
    }
  }
  if (load_bigram_dawg) {
    bigram_dawg_ =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG, dawg_debug_level, data_file);
    // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
    // dawgs_!!
  }
  if (load_freq_dawg) {
    freq_dawg_ =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file);
    if (freq_dawg_) {
      dawgs_.push_back(freq_dawg_);
    }
  }
  if (load_unambig_dawg) {
    unambig_dawg_ =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file);
    if (unambig_dawg_) {
      dawgs_.push_back(unambig_dawg_);
    }
  }

  // Optionally load a user-provided word list into a Trie. An explicit
  // user_words_file path takes precedence over the tessdata-relative
  // user_words_suffix. On read failure the Trie is discarded (with an
  // error message) rather than added to dawgs_.
  std::string name;
  if (!user_words_suffix.empty() || !user_words_file.empty()) {
    Trie *trie_ptr =
        new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
    if (!user_words_file.empty()) {
      name = user_words_file;
    } else {
      name = getCCUtil()->language_data_path_prefix;
      name += user_words_suffix;
    }
    if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
                                          Trie::RRP_REVERSE_IF_HAS_RTL)) {
      tprintf("Error: failed to load %s\n", name.c_str());
      delete trie_ptr;
    } else {
      dawgs_.push_back(trie_ptr);
    }
  }

  // Same scheme for user-provided patterns (a pattern-type Trie).
  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
    Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),
                              dawg_debug_level);
    trie_ptr->initialize_patterns(&(getUnicharset()));
    if (!user_patterns_file.empty()) {
      name = user_patterns_file;
    } else {
      name = getCCUtil()->language_data_path_prefix;
      name += user_patterns_suffix;
    }
    if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
      tprintf("Error: failed to load %s\n", name.c_str());
      delete trie_ptr;
    } else {
      dawgs_.push_back(trie_ptr);
    }
  }

  // Per-document word Trie; searched like the other dawgs_ entries.
  document_words_ =
      new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
  dawgs_.push_back(document_words_);

  // This dawg is temporary and should not be searched by letter_is_ok.
  pending_words_ =
      new Trie(DAWG_TYPE_WORD, lang, NO_PERM, getUnicharset().size(), dawg_debug_level);
}
290 | | |
291 | | // Loads the dawgs needed by the LSTM model. Call FinishLoad() after. |
// Loads the LSTM variants of the punctuation/system/number dawgs plus any
// user-provided word/pattern lists. Mirrors Dict::Load() but reads the
// TESSDATA_LSTM_* components and skips the bigram/freq/unambig dawgs and
// the document/pending word tries. Call FinishLoad() after.
void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
  // Load dawgs_. GetSquishedDawg returns nullptr for missing components,
  // so every result is checked before being added to dawgs_.
  if (load_punc_dawg) {
    punc_dawg_ =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file);
    if (punc_dawg_) {
      dawgs_.push_back(punc_dawg_);
    }
  }
  if (load_system_dawg) {
    Dawg *system_dawg =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
    if (system_dawg) {
      dawgs_.push_back(system_dawg);
    }
  }
  if (load_number_dawg) {
    Dawg *number_dawg =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
    if (number_dawg) {
      dawgs_.push_back(number_dawg);
    }
  }

  // stolen from Dict::Load (but needs params_ from Tesseract
  // langdata/config/api):
  // An explicit user_words_file path takes precedence over the
  // tessdata-relative user_words_suffix; failed reads discard the Trie.
  std::string name;
  if (!user_words_suffix.empty() || !user_words_file.empty()) {
    Trie *trie_ptr =
        new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
    if (!user_words_file.empty()) {
      name = user_words_file;
    } else {
      name = getCCUtil()->language_data_path_prefix;
      name += user_words_suffix;
    }
    if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
                                          Trie::RRP_REVERSE_IF_HAS_RTL)) {
      tprintf("Error: failed to load %s\n", name.c_str());
      delete trie_ptr;
    } else {
      dawgs_.push_back(trie_ptr);
    }
  }

  // Same scheme for user-provided patterns (a pattern-type Trie).
  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
    Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),
                              dawg_debug_level);
    trie_ptr->initialize_patterns(&(getUnicharset()));
    if (!user_patterns_file.empty()) {
      name = user_patterns_file;
    } else {
      name = getCCUtil()->language_data_path_prefix;
      name += user_patterns_suffix;
    }
    if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
      tprintf("Error: failed to load %s\n", name.c_str());
      delete trie_ptr;
    } else {
      dawgs_.push_back(trie_ptr);
    }
  }
}
355 | | |
356 | | // Completes the loading process after Load() and/or LoadLSTM(). |
357 | | // Returns false if no dictionaries were loaded. |
358 | 4 | bool Dict::FinishLoad() { |
359 | 4 | if (dawgs_.empty()) { |
360 | 0 | return false; |
361 | 0 | } |
362 | | // Construct a list of corresponding successors for each dawg. Each entry, i, |
363 | | // in the successors_ vector is a vector of integers that represent the |
364 | | // indices into the dawgs_ vector of the successors for dawg i. |
365 | 4 | successors_.reserve(dawgs_.size()); |
366 | 16 | for (auto dawg : dawgs_) { |
367 | 16 | auto *lst = new SuccessorList(); |
368 | 84 | for (unsigned j = 0; j < dawgs_.size(); ++j) { |
369 | 68 | const Dawg *other = dawgs_[j]; |
370 | 68 | if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) && |
371 | 68 | kDawgSuccessors[dawg->type()][other->type()]) { |
372 | 24 | lst->push_back(j); |
373 | 24 | } |
374 | 68 | } |
375 | 16 | successors_.push_back(lst); |
376 | 16 | } |
377 | 4 | return true; |
378 | 4 | } |
379 | | |
380 | 2 | void Dict::End() { |
381 | 2 | if (dawgs_.empty()) { |
382 | 0 | return; // Not safe to call twice. |
383 | 0 | } |
384 | 6 | for (auto &dawg : dawgs_) { |
385 | 6 | if (!dawg_cache_->FreeDawg(dawg)) { |
386 | 0 | delete dawg; |
387 | 0 | } |
388 | 6 | } |
389 | 2 | dawg_cache_->FreeDawg(bigram_dawg_); |
390 | 2 | if (dawg_cache_is_ours_) { |
391 | 0 | delete dawg_cache_; |
392 | 0 | dawg_cache_ = nullptr; |
393 | 0 | } |
394 | 6 | for (auto successor : successors_) { |
395 | 6 | delete successor; |
396 | 6 | } |
397 | 2 | dawgs_.clear(); |
398 | 2 | successors_.clear(); |
399 | 2 | document_words_ = nullptr; |
400 | 2 | delete pending_words_; |
401 | 2 | pending_words_ = nullptr; |
402 | 2 | } |
403 | | |
404 | | // Returns true if in light of the current state unichar_id is allowed |
405 | | // according to at least one of the dawgs in the dawgs_ vector. |
406 | | // See more extensive comments in dict.h where this function is declared. |
// Core dictionary-permuter step: given the set of active dawg positions in
// dawg_args, computes the set of positions reachable by appending
// unichar_id, stores them in dawg_args->updated_dawgs, sets
// dawg_args->valid_end when a complete word could end here, and returns
// the best permuter code seen (NO_PERM if the letter is rejected
// everywhere). Must handle the three position flavors: punc-only
// (dawg_index < 0), core dawg with a pending punc context, and
// back-to-punc (trailing punctuation after the core word).
int Dict::def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset,
                             UNICHAR_ID unichar_id, bool word_end) const {
  auto *dawg_args = static_cast<DawgArgs *>(void_dawg_args);

  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));

  if (dawg_debug_level >= 3) {
    tesserr << "def_letter_is_okay: current unichar="
            << getUnicharset().debug_str(unichar_id)
            << " word_end=" << word_end
            << " num active dawgs=" << dawg_args->active_dawgs->size() << '\n';
  }

  // Do not accept words that contain kPatternUnicharID.
  // (otherwise pattern dawgs would not function correctly).
  // Do not accept words containing INVALID_UNICHAR_IDs.
  if (unichar_id == Dawg::kPatternUnicharID || unichar_id == INVALID_UNICHAR_ID) {
    dawg_args->permuter = NO_PERM;
    return NO_PERM;
  }

  // Initialization.
  PermuterType curr_perm = NO_PERM;
  dawg_args->updated_dawgs->clear();
  dawg_args->valid_end = false;

  // Go over the active_dawgs vector and insert DawgPosition records
  // with the updated ref (an edge with the corresponding unichar id) into
  // dawg_args->updated_pos.
  for (unsigned a = 0; a < dawg_args->active_dawgs->size(); ++a) {
    const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
    // Negative indices mean "no dawg in this slot".
    const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
    const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;

    if (!dawg && !punc_dawg) {
      // shouldn't happen.
      tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
      continue;
    }
    if (!dawg) {
      // We're in the punctuation dawg. A core dawg has not been chosen.
      NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
      // kPatternUnicharID marks where the punc dawg allows a core word
      // to be inserted; if such a transition exists here, try to start a
      // word in every successor dawg with the current letter.
      EDGE_REF punc_transition_edge =
          punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
      if (punc_transition_edge != NO_EDGE) {
        // Find all successors, and see which can transition.
        const SuccessorList &slist = *(successors_[pos.punc_index]);
        for (int sdawg_index : slist) {
          const Dawg *sdawg = dawgs_[sdawg_index];
          UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
          // Node 0 is the successor dawg's root.
          EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
          if (dawg_edge != NO_EDGE) {
            if (dawg_debug_level >= 3) {
              tprintf("Letter found in dawg %d\n", sdawg_index);
            }
            dawg_args->updated_dawgs->add_unique(
                DawgPosition(sdawg_index, dawg_edge, pos.punc_index, punc_transition_edge, false),
                dawg_debug_level > 0, "Append transition from punc dawg to current dawgs: ");
            if (sdawg->permuter() > curr_perm) {
              curr_perm = sdawg->permuter();
            }
            // Valid word end only if both the core word and the punc
            // pattern can end here.
            if (sdawg->end_of_word(dawg_edge) && punc_dawg->end_of_word(punc_transition_edge)) {
              dawg_args->valid_end = true;
            }
          }
        }
      }
      // Also try consuming the letter as punctuation itself.
      EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
      if (punc_edge != NO_EDGE) {
        if (dawg_debug_level >= 3) {
          tprintf("Letter found in punctuation dawg\n");
        }
        dawg_args->updated_dawgs->add_unique(
            DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false), dawg_debug_level > 0,
            "Extend punctuation dawg: ");
        if (PUNC_PERM > curr_perm) {
          curr_perm = PUNC_PERM;
        }
        if (punc_dawg->end_of_word(punc_edge)) {
          dawg_args->valid_end = true;
        }
      }
      continue;
    }

    if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
      // We can end the main word here.
      // If we can continue on the punc ref, add that possibility.
      NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
      EDGE_REF punc_edge =
          punc_node == NO_EDGE ? NO_EDGE : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
      if (punc_edge != NO_EDGE) {
        // back_to_punc=true: the core word is finished; only trailing
        // punctuation may follow from this position.
        dawg_args->updated_dawgs->add_unique(
            DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index, punc_edge, true),
            dawg_debug_level > 0, "Return to punctuation dawg: ");
        if (dawg->permuter() > curr_perm) {
          curr_perm = dawg->permuter();
        }
        if (punc_dawg->end_of_word(punc_edge)) {
          dawg_args->valid_end = true;
        }
      }
    }

    if (pos.back_to_punc) {
      // Trailing-punctuation positions cannot extend the core word.
      continue;
    }

    // If we are dealing with the pattern dawg, look up all the
    // possible edges, not only for the exact unichar_id, but also
    // for all its character classes (alpha, digit, etc).
    if (dawg->type() == DAWG_TYPE_PATTERN) {
      ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args, &curr_perm);
      // There can't be any successors to dawg that is of type
      // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
      continue;
    }

    // Find the edge out of the node for the unichar_id.
    NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
    EDGE_REF edge =
        (node == NO_EDGE)
            ? NO_EDGE
            : dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg), word_end);

    if (dawg_debug_level >= 3) {
      tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node, edge);
    }

    if (edge != NO_EDGE) { // the unichar was found in the current dawg
      if (dawg_debug_level >= 3) {
        tprintf("Letter found in dawg %d\n", pos.dawg_index);
      }
      // At word end the pending punctuation context must also be
      // satisfiable, otherwise this continuation is dropped.
      if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
        if (dawg_debug_level >= 3) {
          tprintf("Punctuation constraint not satisfied at end of word.\n");
        }
        continue;
      }
      if (dawg->permuter() > curr_perm) {
        curr_perm = dawg->permuter();
      }
      if (dawg->end_of_word(edge) &&
          (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref))) {
        dawg_args->valid_end = true;
      }
      dawg_args->updated_dawgs->add_unique(
          DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, false),
          dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
    }
  } // end for
  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
  // or if we found the current letter in a non-punctuation dawg. This
  // allows preserving information on which dawg the "core" word came from.
  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
      (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
    dawg_args->permuter = curr_perm;
  }
  if (dawg_debug_level >= 2) {
    tprintf("Returning %d for permuter code for this character.\n", dawg_args->permuter);
  }
  return dawg_args->permuter;
}
571 | | |
// Pattern-dawg counterpart of the per-dawg step in def_letter_is_okay():
// looks up edges for unichar_id itself and for each of its character-class
// pattern ids, appending every match to dawg_args->updated_dawgs and
// raising *curr_perm / dawg_args->valid_end accordingly.
void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHAR_ID unichar_id,
                               bool word_end, DawgArgs *dawg_args, PermuterType *curr_perm) const {
  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
  // Try to find the edge corresponding to the exact unichar_id and to all the
  // edges corresponding to the character class of unichar_id.
  std::vector<UNICHAR_ID> unichar_id_patterns;
  unichar_id_patterns.push_back(unichar_id);
  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns);
  for (int unichar_id_pattern : unichar_id_patterns) {
    // On the first iteration check all the outgoing edges.
    // On the second iteration check all self-loops.
    for (int k = 0; k < 2; ++k) {
      EDGE_REF edge = (k == 0)
                          ? dawg->edge_char_of(node, unichar_id_pattern, word_end)
                          : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_pattern, word_end);
      if (edge == NO_EDGE) {
        continue;
      }
      if (dawg_debug_level >= 3) {
        tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node,
                edge);
        tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
      }
      if (dawg->permuter() > *curr_perm) {
        *curr_perm = dawg->permuter();
      }
      if (dawg->end_of_word(edge)) {
        dawg_args->valid_end = true;
      }
      // Preserve the punc context and back_to_punc flag unchanged.
      dawg_args->updated_dawgs->add_unique(
          DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, pos.back_to_punc),
          dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
    }
  }
}
607 | | |
608 | | // Fill the given active_dawgs vector with dawgs that could contain the |
609 | | // beginning of the word. If hyphenated() returns true, copy the entries |
610 | | // from hyphen_active_dawgs_ instead. |
611 | 358k | void Dict::init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const { |
612 | 358k | if (hyphenated()) { |
613 | 0 | *active_dawgs = hyphen_active_dawgs_; |
614 | 0 | if (dawg_debug_level >= 3) { |
615 | 0 | for (unsigned i = 0; i < hyphen_active_dawgs_.size(); ++i) { |
616 | 0 | tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n", |
617 | 0 | hyphen_active_dawgs_[i].dawg_index, hyphen_active_dawgs_[i].dawg_ref); |
618 | 0 | } |
619 | 0 | } |
620 | 358k | } else { |
621 | 358k | default_dawgs(active_dawgs, ambigs_mode); |
622 | 358k | } |
623 | 358k | } |
624 | | |
625 | 1.02M | void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, bool suppress_patterns) const { |
626 | 1.02M | bool punc_dawg_available = (punc_dawg_ != nullptr) && |
627 | 1.02M | punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE; |
628 | | |
629 | 4.64M | for (unsigned i = 0; i < dawgs_.size(); i++) { |
630 | 3.62M | if (dawgs_[i] != nullptr && !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) { |
631 | 3.62M | int dawg_ty = dawgs_[i]->type(); |
632 | 3.62M | bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty]; |
633 | 3.62M | if (dawg_ty == DAWG_TYPE_PUNCTUATION) { |
634 | 724k | dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false)); |
635 | 724k | if (dawg_debug_level >= 3) { |
636 | 0 | tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE); |
637 | 0 | } |
638 | 2.89M | } else if (!punc_dawg_available || !subsumed_by_punc) { |
639 | 0 | dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false)); |
640 | 0 | if (dawg_debug_level >= 3) { |
641 | 0 | tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE); |
642 | 0 | } |
643 | 0 | } |
644 | 3.62M | } |
645 | 3.62M | } |
646 | 1.02M | } |
647 | | |
// Considers adding best_choice to the per-document word dawg
// (document_words_). Words are filtered out if they are hyphen fragments,
// already valid dictionary words, too short, or dominated by a repeated
// unichar; borderline-certainty words are staged in pending_words_ and
// only promoted when seen again.
void Dict::add_document_word(const WERD_CHOICE &best_choice) {
  // Do not add hyphenated word parts to the document dawg.
  // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
  // called when the first part of the hyphenated word is
  // discovered and while the second part of the word is recognized.
  // hyphen_word_ is cleared in cc_recg() before the next word on
  // the line is recognized.
  if (hyphen_word_) {
    return;
  }

  int stringlen = best_choice.length();

  // Already-valid words and single characters are not worth recording.
  if (valid_word(best_choice) || stringlen < 2) {
    return;
  }

  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
  if (best_choice.length() >= kDocDictMaxRepChars) {
    int num_rep_chars = 1;
    UNICHAR_ID uch_id = best_choice.unichar_id(0);
    for (unsigned i = 1; i < best_choice.length(); ++i) {
      if (best_choice.unichar_id(i) != uch_id) {
        num_rep_chars = 1;
        uch_id = best_choice.unichar_id(i);
      } else {
        ++num_rep_chars;
        if (num_rep_chars == kDocDictMaxRepChars) {
          return;
        }
      }
    }
  }

  // Words with certainty below doc_dict_certainty_threshold ("worst
  // certainty for words that can be inserted into the document
  // dictionary"), and all 2-letter words, go through the pending stage.
  if (best_choice.certainty() < doc_dict_certainty_threshold || stringlen == 2) {
    // Too uncertain even for the pending list.
    if (best_choice.certainty() < doc_dict_pending_threshold) {
      return;
    }

    // First sighting: stage it in pending_words_ (2-letter words only if
    // both letters are uppercase, i.e. likely an acronym) and stop.
    // A second sighting falls through to the code below and promotes the
    // word into document_words_.
    if (!pending_words_->word_in_dawg(best_choice)) {
      if (stringlen > 2 ||
          (stringlen == 2 && getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
           getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
        pending_words_->add_word_to_dawg(best_choice);
      }
      return;
    }
  }

  // Optionally append the accepted word to "<imagefile>.doc" for debugging.
  if (save_doc_words) {
    std::string filename(getCCUtil()->imagefile);
    filename += ".doc";
    FILE *doc_word_file = fopen(filename.c_str(), "a");
    if (doc_word_file == nullptr) {
      tprintf("Error: Could not open file %s\n", filename.c_str());
      ASSERT_HOST(doc_word_file);
    }
    fprintf(doc_word_file, "%s\n", best_choice.debug_string().c_str());
    fclose(doc_word_file);
  }
  document_words_->add_word_to_dawg(best_choice);
}
710 | | |
711 | | void Dict::adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, |
712 | 331k | float additional_adjust, bool modify_rating, bool debug) { |
713 | 331k | bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() && |
714 | 331k | word->GetTopScriptID() == getUnicharset().han_sid()); |
715 | 331k | bool case_is_ok = (is_han || case_ok(*word)); |
716 | 331k | bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word)); |
717 | | |
718 | 331k | float adjust_factor = additional_adjust; |
719 | 331k | float new_rating = word->rating(); |
720 | 331k | new_rating += kRatingPad; |
721 | 331k | const char *xheight_triggered = ""; |
722 | 331k | if (word->length() > 1) { |
723 | | // Calculate x-height and y-offset consistency penalties. |
724 | 297k | switch (xheight_consistency) { |
725 | 209k | case XH_INCONSISTENT: |
726 | 209k | adjust_factor += xheight_penalty_inconsistent; |
727 | 209k | xheight_triggered = ", xhtBAD"; |
728 | 209k | break; |
729 | 51.8k | case XH_SUBNORMAL: |
730 | 51.8k | adjust_factor += xheight_penalty_subscripts; |
731 | 51.8k | xheight_triggered = ", xhtSUB"; |
732 | 51.8k | break; |
733 | 36.3k | case XH_GOOD: |
734 | | // leave the factor alone - all good! |
735 | 36.3k | break; |
736 | 297k | } |
737 | | // TODO(eger): if nonword is true, but there is a "core" that is a dict |
738 | | // word, negate nonword status. |
739 | 297k | } else { |
740 | 34.0k | if (debug) { |
741 | 0 | tprintf("Consistency could not be calculated.\n"); |
742 | 0 | } |
743 | 34.0k | } |
744 | 331k | if (debug) { |
745 | 0 | tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "", word->unichar_string().c_str(), |
746 | 0 | word->rating(), xheight_triggered); |
747 | 0 | } |
748 | | |
749 | 331k | if (nonword) { // non-dictionary word |
750 | 331k | if (case_is_ok && punc_is_ok) { |
751 | 0 | adjust_factor += segment_penalty_dict_nonword; |
752 | 0 | new_rating *= adjust_factor; |
753 | 0 | if (debug) { |
754 | 0 | tprintf(", W"); |
755 | 0 | } |
756 | 331k | } else { |
757 | 331k | adjust_factor += segment_penalty_garbage; |
758 | 331k | new_rating *= adjust_factor; |
759 | 331k | if (debug) { |
760 | 0 | if (!case_is_ok) { |
761 | 0 | tprintf(", C"); |
762 | 0 | } |
763 | 0 | if (!punc_is_ok) { |
764 | 0 | tprintf(", P"); |
765 | 0 | } |
766 | 0 | } |
767 | 331k | } |
768 | 331k | } else { // dictionary word |
769 | 0 | if (case_is_ok) { |
770 | 0 | if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) { |
771 | 0 | word->set_permuter(FREQ_DAWG_PERM); |
772 | 0 | adjust_factor += segment_penalty_dict_frequent_word; |
773 | 0 | new_rating *= adjust_factor; |
774 | 0 | if (debug) { |
775 | 0 | tprintf(", F"); |
776 | 0 | } |
777 | 0 | } else { |
778 | 0 | adjust_factor += segment_penalty_dict_case_ok; |
779 | 0 | new_rating *= adjust_factor; |
780 | 0 | if (debug) { |
781 | 0 | tprintf(", "); |
782 | 0 | } |
783 | 0 | } |
784 | 0 | } else { |
785 | 0 | adjust_factor += segment_penalty_dict_case_bad; |
786 | 0 | new_rating *= adjust_factor; |
787 | 0 | if (debug) { |
788 | 0 | tprintf(", C"); |
789 | 0 | } |
790 | 0 | } |
791 | 0 | } |
792 | 331k | new_rating -= kRatingPad; |
793 | 331k | if (modify_rating) { |
794 | 0 | word->set_rating(new_rating); |
795 | 0 | } |
796 | 331k | if (debug) { |
797 | 0 | tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating); |
798 | 0 | } |
799 | 331k | word->set_adjust_factor(adjust_factor); |
800 | 331k | } |
801 | | |
// Checks the word against all loaded dawgs and returns the permuter type of
// the matching dawg (e.g. SYSTEM_DAWG_PERM), or NO_PERM if no dawg accepts
// the full word. If a hyphenated first half is pending (hyphenated()), it is
// prepended to the word before checking. numbers_ok is forwarded to
// valid_word_permuter() to decide whether number permuters count as valid.
int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
  const WERD_CHOICE *word_ptr = &word;
  WERD_CHOICE temp_word(word.unicharset());
  // Only merge hyphen state when both halves share the same unicharset.
  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
    copy_hyphen_info(&temp_word);
    temp_word += word;
    word_ptr = &temp_word;
  }
  if (word_ptr->empty()) {
    return NO_PERM;
  }
  // Allocate vectors for holding current and updated
  // active_dawgs and initialize them.
  DawgPositionVector active_dawgs[2];
  init_active_dawgs(&(active_dawgs[0]), false);
  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
  int last_index = word_ptr->length() - 1;
  // Call letter_is_okay for each letter in the word.
  // Starting at hyphen_base_size() skips the letters that were already
  // advanced through the dawgs when the first (hyphenated) half was seen.
  // letter_is_okay_ prunes dawg_args.active_dawgs into updated_dawgs and
  // records the accepting permuter in dawg_args.permuter on the last letter.
  for (int i = hyphen_base_size(); i <= last_index; ++i) {
    if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(), word_ptr->unichar_id(i),
                                   i == last_index))) {
      break;
    }
    // Swap active_dawgs, constraints with the corresponding updated vector.
    // Ping-pong between active_dawgs[0] and active_dawgs[1]: the vector that
    // was just filled as "updated" becomes "active" for the next letter, and
    // the other one is reused as the next "updated" target. The pointer
    // increments below rely on the two elements being adjacent in the array.
    if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
      dawg_args.updated_dawgs = &(active_dawgs[0]);
      ++(dawg_args.active_dawgs);
    } else {
      ++(dawg_args.updated_dawgs);
      dawg_args.active_dawgs = &(active_dawgs[0]);
    }
  }
  // permuter stays NO_PERM unless the final letter completed a dawg word.
  return valid_word_permuter(dawg_args.permuter, numbers_ok) ? dawg_args.permuter : NO_PERM;
}
836 | | |
837 | 0 | bool Dict::valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const { |
838 | 0 | if (bigram_dawg_ == nullptr) { |
839 | 0 | return false; |
840 | 0 | } |
841 | | |
842 | | // Extract the core word from the middle of each word with any digits |
843 | | // replaced with question marks. |
844 | 0 | unsigned w1start, w1end, w2start, w2end; |
845 | 0 | word1.punct_stripped(&w1start, &w1end); |
846 | 0 | word2.punct_stripped(&w2start, &w2end); |
847 | | |
848 | | // We don't want to penalize a single guillemet, hyphen, etc. |
849 | | // But our bigram list doesn't have any information about punctuation. |
850 | 0 | if (w1start >= w1end) { |
851 | 0 | return word1.length() < 3; |
852 | 0 | } |
853 | 0 | if (w2start >= w2end) { |
854 | 0 | return word2.length() < 3; |
855 | 0 | } |
856 | | |
857 | 0 | const UNICHARSET &uchset = getUnicharset(); |
858 | 0 | std::vector<UNICHAR_ID> bigram_string; |
859 | 0 | bigram_string.reserve(w1end + w2end + 1); |
860 | 0 | for (auto i = w1start; i < w1end; i++) { |
861 | 0 | const auto &normed_ids = getUnicharset().normed_ids(word1.unichar_id(i)); |
862 | 0 | if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) { |
863 | 0 | bigram_string.push_back(question_unichar_id_); |
864 | 0 | } else { |
865 | 0 | bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end()); |
866 | 0 | } |
867 | 0 | } |
868 | 0 | bigram_string.push_back(UNICHAR_SPACE); |
869 | 0 | for (auto i = w2start; i < w2end; i++) { |
870 | 0 | const auto &normed_ids = getUnicharset().normed_ids(word2.unichar_id(i)); |
871 | 0 | if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) { |
872 | 0 | bigram_string.push_back(question_unichar_id_); |
873 | 0 | } else { |
874 | 0 | bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end()); |
875 | 0 | } |
876 | 0 | } |
877 | 0 | WERD_CHOICE normalized_word(&uchset, bigram_string.size()); |
878 | 0 | for (int i : bigram_string) { |
879 | 0 | normalized_word.append_unichar_id_space_allocated(i, 1, 0.0f, 0.0f); |
880 | 0 | } |
881 | 0 | return bigram_dawg_->word_in_dawg(normalized_word); |
882 | 0 | } |
883 | | |
884 | 331k | bool Dict::valid_punctuation(const WERD_CHOICE &word) { |
885 | 331k | if (word.empty()) { |
886 | 0 | return NO_PERM; |
887 | 0 | } |
888 | 331k | WERD_CHOICE new_word(word.unicharset()); |
889 | 331k | auto last_index = word.length() - 1; |
890 | 331k | int new_len; |
891 | 3.35M | for (unsigned i = 0; i <= last_index; ++i) { |
892 | 3.05M | UNICHAR_ID unichar_id = (word.unichar_id(i)); |
893 | 3.05M | if (getUnicharset().get_ispunctuation(unichar_id)) { |
894 | 845k | new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0); |
895 | 2.20M | } else if (!getUnicharset().get_isalpha(unichar_id) && |
896 | 2.20M | !getUnicharset().get_isdigit(unichar_id)) { |
897 | 29.6k | return false; // neither punc, nor alpha, nor digit |
898 | 2.17M | } else if ((new_len = new_word.length()) == 0 || |
899 | 2.17M | new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) { |
900 | 645k | new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0); |
901 | 645k | } |
902 | 3.05M | } |
903 | 301k | for (unsigned i = 0; i < dawgs_.size(); ++i) { |
904 | 0 | if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION && |
905 | 0 | dawgs_[i]->word_in_dawg(new_word)) { |
906 | 0 | return true; |
907 | 0 | } |
908 | 0 | } |
909 | 301k | return false; |
910 | 301k | } |
911 | | |
912 | | /// Returns true if the language is space-delimited (not CJ, or T). |
913 | 1 | bool Dict::IsSpaceDelimitedLang() const { |
914 | 1 | const UNICHARSET &u_set = getUnicharset(); |
915 | 1 | if (u_set.han_sid() > 0) { |
916 | 0 | return false; |
917 | 0 | } |
918 | 1 | if (u_set.katakana_sid() > 0) { |
919 | 0 | return false; |
920 | 0 | } |
921 | 1 | if (u_set.thai_sid() > 0) { |
922 | 0 | return false; |
923 | 0 | } |
924 | 1 | return true; |
925 | 1 | } |
926 | | |
927 | | } // namespace tesseract |