/src/aspell/modules/speller/default/suggest.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2000-2005 by Kevin Atkinson under the terms of the LGPL |
2 | | |
3 | | // suggest.cpp Suggestion code for Aspell |
4 | | |
5 | | // The magic behind my spell checker comes from merging Lawrence |
6 | | // Philips excellent metaphone algorithm and Ispell's near miss |
7 | | // strategy which is inserting a space or hyphen, interchanging two |
8 | | // adjacent letters, changing one letter, deleting a letter, or adding |
9 | | // a letter. |
10 | | // |
11 | | // The process goes something like this. |
12 | | // |
13 | | // 1. Convert the misspelled word to its soundslike equivalent (its |
14 | | // metaphone for English words). |
15 | | // |
16 | | // 2. Find words that have the same soundslike pattern. |
17 | | // |
18 | | // 3. Find words that have similar soundslike patterns. A similar |
19 | | // soundlike pattern is a pattern that is obtained by |
20 | | // interchanging two adjacent letters, changing one letter, |
21 | | // deleting a letter, or adding a letter. |
22 | | // |
23 | | // 4. Score the result list and return the words with the lowest |
24 | | // score. The score is roughly the weighed average of the edit |
25 | | // distance of the word to the misspelled word, the soundslike |
26 | | // equivalent of the two words, and the phoneme of the two words. |
27 | | // The edit distance is the weighed total of the number of |
28 | | // deletions, insertions, exchanges, or adjacent swaps needed to |
29 | | // make one string equivalent to the other. |
30 | | // |
31 | | // Please note that the soundlike equivalent is a rough approximation |
32 | | // of how the words sounds. It is not the phoneme of the word by any |
33 | | // means. For more information on the metaphone algorithm please see |
34 | | // the file metaphone.cc which included a detailed description of it. |
35 | | // |
36 | | // NOTE: It is assumed that that strlen(soundslike) <= strlen(word) |
37 | | // for any possible word |
38 | | |
39 | | // POSSIBLE OPTIMIZATION: |
40 | | // store the number of letters that are the same as the previous |
41 | | // soundslike so that it can possible be skipped |
42 | | |
43 | | #include "getdata.hpp" |
44 | | |
45 | | #include "fstream.hpp" |
46 | | |
47 | | #include "speller_impl.hpp" |
48 | | #include "asuggest.hpp" |
49 | | #include "basic_list.hpp" |
50 | | #include "clone_ptr-t.hpp" |
51 | | #include "config.hpp" |
52 | | #include "data.hpp" |
53 | | #include "editdist.hpp" |
54 | | #include "editdist2.hpp" |
55 | | #include "errors.hpp" |
56 | | #include "file_data_util.hpp" |
57 | | #include "hash-t.hpp" |
58 | | #include "language.hpp" |
59 | | #include "leditdist.hpp" |
60 | | #include "speller_impl.hpp" |
61 | | #include "stack_ptr.hpp" |
62 | | #include "suggest.hpp" |
63 | | #include "vararray.hpp" |
64 | | #include "string_list.hpp" |
65 | | |
66 | | #include "gettext.h" |
67 | | |
68 | | //#include "iostream.hpp" |
69 | | //#define DEBUG_SUGGEST |
70 | | |
71 | | using namespace aspeller; |
72 | | using namespace acommon; |
73 | | using std::pair; |
74 | | |
75 | | namespace { |
76 | | |
77 | | template <class Iterator> |
78 | 655k | inline Iterator preview_next (Iterator i) { |
79 | 655k | return ++i; |
80 | 655k | } |
81 | | |
82 | | // |
83 | | // OriginalWord stores information about the original misspelled word |
84 | | // for convince and speed. |
85 | | // |
86 | | struct OriginalWord { |
87 | | String word; |
88 | | String lower; |
89 | | String clean; |
90 | | String soundslike; |
91 | | CasePattern case_pattern; |
92 | 8.21k | OriginalWord() {} |
93 | | }; |
94 | | |
95 | | // |
96 | | // struct ScoreWordSound - used for storing the possible words while |
97 | | // they are being processed. |
98 | | // |
99 | | |
100 | | static const char * NO_SOUNDSLIKE = ""; |
101 | | |
102 | | class Working; |
103 | | |
104 | | enum SpecialEdit {None, Split, CamelSplit, CamelJoin, CamelOffByOne}; |
105 | | |
106 | 23.5k | static inline int special_score(const EditDistanceWeights & w, SpecialEdit e) { |
107 | 23.5k | switch (e) { |
108 | 22.5k | case Split: |
109 | 22.5k | return w.max + 2; |
110 | 45 | case CamelJoin: |
111 | 45 | return w.max + 1; |
112 | 786 | case CamelSplit: |
113 | 786 | return w.max + 1; |
114 | 200 | case CamelOffByOne: |
115 | 200 | return w.swap - 1; |
116 | 0 | default: |
117 | 0 | abort(); |
118 | 23.5k | } |
119 | 23.5k | } |
120 | | |
121 | | struct SpecialTypoScore { |
122 | | int score; |
123 | | bool is_overall_score; |
124 | 1.79M | operator bool() const {return score < LARGE_NUM;} |
125 | | SpecialTypoScore() |
126 | 977k | : score(LARGE_NUM), is_overall_score(false) {} |
127 | | SpecialTypoScore(int s, bool q) |
128 | 42.2k | : score(s), is_overall_score(q) {} |
129 | | }; |
130 | | |
131 | 1.01M | static inline SpecialTypoScore special_typo_score(const TypoEditDistanceInfo & w, SpecialEdit e) { |
132 | 1.01M | switch (e) { |
133 | 977k | case None: |
134 | 977k | return SpecialTypoScore(); |
135 | 40.6k | case Split: |
136 | 40.6k | return SpecialTypoScore(w.max + 2, true); |
137 | 1.11k | case CamelSplit: |
138 | 1.11k | return SpecialTypoScore(w.max + 1, true); |
139 | 90 | case CamelJoin: |
140 | 90 | return SpecialTypoScore(w.max + 1, true); |
141 | 400 | case CamelOffByOne: |
142 | 400 | return SpecialTypoScore(w.swap - 1, false); |
143 | 0 | default: |
144 | 0 | abort(); |
145 | 1.01M | } |
146 | 1.01M | } |
147 | | |
148 | | struct ScoreWordSound { |
149 | | Working * src; |
150 | | char * word; |
151 | | char * word_clean; |
152 | | //unsigned word_size; |
153 | | const char * soundslike; |
154 | | int score; |
155 | | int adj_score; |
156 | | int word_score; |
157 | | int soundslike_score; |
158 | | bool count; |
159 | | SpecialEdit special_edit; |
160 | | bool repl_table; |
161 | | WordEntry * repl_list; |
162 | 12.1M | ScoreWordSound(Working * s) : src(s), adj_score(LARGE_NUM), repl_list(0) {} |
163 | 24.2M | ~ScoreWordSound() {delete repl_list;} |
164 | | }; |
165 | | |
166 | | inline int compare (const ScoreWordSound &lhs, |
167 | | const ScoreWordSound &rhs) |
168 | 43.8M | { |
169 | 43.8M | int temp = lhs.score - rhs.score; |
170 | 43.8M | if (temp) return temp; |
171 | 33.0M | return strcmp(lhs.word,rhs.word); |
172 | 43.8M | } |
173 | | |
174 | | inline int adj_score_lt(const ScoreWordSound &lhs, |
175 | | const ScoreWordSound &rhs) |
176 | 18.5M | { |
177 | 18.5M | int temp = lhs.adj_score - rhs.adj_score; |
178 | 18.5M | if (temp) return temp < 0; |
179 | 14.9M | return strcmp(lhs.word,rhs.word) < 0; |
180 | 18.5M | } |
181 | | |
182 | | inline bool operator < (const ScoreWordSound & lhs, |
183 | 43.8M | const ScoreWordSound & rhs) { |
184 | 43.8M | return compare(lhs, rhs) < 0; |
185 | 43.8M | } |
186 | | |
187 | | inline bool operator <= (const ScoreWordSound & lhs, |
188 | 0 | const ScoreWordSound & rhs) { |
189 | 0 | return compare(lhs, rhs) <= 0; |
190 | 0 | } |
191 | | |
192 | | inline bool operator == (const ScoreWordSound & lhs, |
193 | 0 | const ScoreWordSound & rhs) { |
194 | 0 | return compare(lhs, rhs) == 0; |
195 | 0 | } |
196 | | |
197 | | typedef BasicList<ScoreWordSound> NearMisses; |
198 | | |
199 | | class Sugs; |
200 | | |
201 | | class Working { |
202 | | friend class Sugs; |
203 | | |
204 | | const Language * lang; |
205 | | OriginalWord original; |
206 | | const SuggestParms * parms; |
207 | | SpellerImpl * sp; |
208 | | |
209 | | String prefix; |
210 | | String suffix; |
211 | | bool have_presuf; |
212 | | |
213 | | int threshold; |
214 | | int adj_threshold; |
215 | | int try_harder; |
216 | | |
217 | | EditDist (* edit_dist_fun)(const char *, const char *, |
218 | | const EditDistanceWeights &); |
219 | | |
220 | | unsigned int max_word_length; |
221 | | |
222 | | NearMisses scored_near_misses; |
223 | | NearMisses near_misses; |
224 | | |
225 | | char * temp_end; |
226 | | |
227 | | ObjStack buffer; |
228 | | ObjStack temp_buffer; |
229 | | |
230 | | static const bool do_count = true; |
231 | | static const bool dont_count = false; |
232 | | |
233 | | CheckInfo check_info[8]; |
234 | | |
235 | 2.21M | void commit_temp(const char * b) { |
236 | 2.21M | if (temp_end) { |
237 | 900k | buffer.resize_temp(temp_end - b + 1); |
238 | 900k | buffer.commit_temp(); |
239 | 900k | temp_end = 0; }} |
240 | 22.2M | void abort_temp() { |
241 | 22.2M | buffer.abort_temp(); |
242 | 22.2M | temp_end = 0;} |
243 | 0 | const char * to_soundslike_temp(const char * w, unsigned s, unsigned * len = 0) { |
244 | 0 | char * sl = (char *)buffer.alloc_temp(s + 1); |
245 | 0 | temp_end = lang->LangImpl::to_soundslike(sl, w, s); |
246 | 0 | if (len) *len = temp_end - sl; |
247 | 0 | return sl;} |
248 | 1.31M | const char * to_soundslike_temp(const WordEntry & sw) { |
249 | 1.31M | char * sl = (char *)buffer.alloc_temp(sw.word_size + 1); |
250 | 1.31M | temp_end = lang->LangImpl::to_soundslike(sl, sw.word, sw.word_size, sw.word_info); |
251 | 1.31M | if (temp_end == 0) return sw.word; |
252 | 1.31M | else return sl;} |
253 | 277k | const char * to_soundslike(const char * w, unsigned s) { |
254 | 277k | char * sl = (char *)buffer.alloc_temp(s + 1); |
255 | 277k | temp_end = lang->LangImpl::to_soundslike(sl, w, s); |
256 | 277k | commit_temp(sl); |
257 | 277k | return sl;} |
258 | | |
259 | | struct ScoreInfo { |
260 | | const char * soundslike; |
261 | | int word_score; |
262 | | int soundslike_score; |
263 | | bool count; |
264 | | SpecialEdit special_edit; |
265 | | bool repl_table; |
266 | | WordEntry * repl_list; |
267 | | ScoreInfo() |
268 | | : soundslike(), word_score(LARGE_NUM), soundslike_score(LARGE_NUM), |
269 | 15.5M | count(true), special_edit(None), repl_table(false), repl_list() {} |
270 | | }; |
271 | | |
272 | 722k | char * fix_case(char * str) { |
273 | 722k | lang->LangImpl::fix_case(original.case_pattern, str, str); |
274 | 722k | return str; |
275 | 722k | } |
276 | 0 | const char * fix_case(const char * str, String & buf) { |
277 | 0 | return lang->LangImpl::fix_case(original.case_pattern, str, buf); |
278 | 0 | } |
279 | | |
280 | | char * fix_word(ObjStack & buf, ParmStr w); |
281 | | |
282 | | MutableString form_word(CheckInfo & ci); |
283 | | void try_word_n(ParmString str, const ScoreInfo & inf); |
284 | | bool check_word_s(ParmString word, CheckInfo * ci); |
285 | | unsigned check_word(char * word, char * word_end, CheckInfo * ci, |
286 | | /* it WILL modify word */ |
287 | | unsigned pos = 1); |
288 | | void try_word_c(char * word, char * word_end, const ScoreInfo & inf); |
289 | | |
290 | 3.63M | void try_word(char * word, char * word_end, const ScoreInfo & inf) { |
291 | 3.63M | if (sp->unconditional_run_together_) |
292 | 103k | try_word_c(word,word_end,inf); |
293 | 3.52M | else |
294 | 3.52M | try_word_n(word,inf); |
295 | 3.63M | } |
296 | 3.60M | void try_word(char * word, char * word_end, int score) { |
297 | 3.60M | ScoreInfo inf; |
298 | 3.60M | inf.word_score = score; |
299 | 3.60M | try_word(word, word_end, inf); |
300 | 3.60M | } |
301 | | |
302 | | void add_sound(SpellerImpl::WS::const_iterator i, |
303 | | WordEntry * sw, const char * sl, int score = LARGE_NUM); |
304 | | void add_nearmiss(char * word, unsigned word_size, WordInfo word_info, |
305 | | const ScoreInfo &); |
306 | | void add_nearmiss_w(SpellerImpl::WS::const_iterator, const WordEntry & w, |
307 | | const ScoreInfo &); |
308 | | void add_nearmiss_a(SpellerImpl::WS::const_iterator, const WordAff * w, |
309 | | const ScoreInfo &); |
310 | 0 | bool have_score(int score) {return score < LARGE_NUM;} |
311 | 30.4M | int needed_level(int want, int soundslike_score) { |
312 | | // (word_weight*??? + soundlike_weight*soundslike_score)/100 <= want |
313 | | // word_weight*??? + soundlike_weight*soundslike_score <= want*100 |
314 | | // word_weight*??? <= want*100 - soundlike_weight*soundslike_score |
315 | | // ??? <= (want*100 - soundlike_weight*soundslike_score) / word_weight |
316 | | // level = ceil(???/edit_distance_weights.min) |
317 | 30.4M | int n = 100*want - parms->soundslike_weight*soundslike_score; |
318 | 30.4M | if (n <= 0) return 0; |
319 | 28.6M | int d = parms->word_weight*parms->edit_distance_weights.min; |
320 | 28.6M | return (n-1)/d+1; // roundup |
321 | 30.4M | } |
322 | 17.1M | int weighted_average(int soundslike_score, int word_score) { |
323 | 17.1M | return (parms->word_weight*word_score |
324 | 17.1M | + parms->soundslike_weight*soundslike_score)/100; |
325 | 17.1M | } |
326 | 776k | int adj_wighted_average(int soundslike_score, int word_score, int one_edit_max) { |
327 | 776k | int soundslike_weight = parms->soundslike_weight; |
328 | 776k | int word_weight = parms->word_weight; |
329 | 776k | if (word_score <= one_edit_max) { |
330 | 216k | const int factor = word_score < 100 ? 8 : 2; |
331 | 216k | soundslike_weight = (parms->soundslike_weight+factor-1)/factor; |
332 | 216k | } |
333 | | // NOTE: Theoretical if the soundslike is might be beneficial to |
334 | | // adjust the word score so it doesn't contribute as much. If |
335 | | // the score is already around 100 (one edit dist) then it may |
336 | | // not be a good idea to lower it more, but if the word score is |
337 | | // 200 or more then it might make sence to reduce it some. |
338 | | // HOWEVER, this will likely not work well, espacally with small |
339 | | // words and there are just too many words with the same |
340 | | // soundlike. In any case that what the special "soundslike" |
341 | | // and "bad-spellers" mode is for. |
342 | 776k | return (word_weight*word_score |
343 | 776k | + soundslike_weight*soundslike_score)/100; |
344 | 776k | } |
345 | 24.4k | int skip_first_couple(NearMisses::iterator & i) { |
346 | 24.4k | int k = 0; |
347 | 24.4k | InsensitiveCompare cmp(lang); |
348 | 24.4k | const char * prev_word = ""; |
349 | 655k | while (preview_next(i) != scored_near_misses.end()) |
350 | | // skip over the first couple of items as they should |
351 | | // not be counted in the threshold score. |
352 | 650k | { |
353 | 650k | if (!i->count || cmp(prev_word, i->word) == 0) { |
354 | 589k | ++i; |
355 | 589k | } else if (k == parms->skip) { |
356 | 19.6k | break; |
357 | 41.6k | } else { |
358 | 41.6k | prev_word = i->word; |
359 | 41.6k | ++k; |
360 | 41.6k | ++i; |
361 | 41.6k | } |
362 | 650k | } |
363 | 24.4k | return k; |
364 | 24.4k | } |
365 | | |
366 | | void try_camel_word(String & word, SpecialEdit edit); |
367 | | |
368 | | void try_split(); |
369 | | void try_camel_edits(); |
370 | | void try_one_edit_word(); |
371 | | void try_scan(); |
372 | | void try_scan_root(); |
373 | | void try_repl(); |
374 | | void try_ngram(); |
375 | | |
376 | | void score_list(); |
377 | | void fine_tune_score(int thres); |
378 | | public: |
379 | | Working(SpellerImpl * m, const Language *l, |
380 | | const String & w, const SuggestParms * p) |
381 | | : lang(l), original(), parms(p), sp(m), have_presuf(false) |
382 | | , threshold(1), max_word_length(0) |
383 | 8.21k | { |
384 | 8.21k | memset(static_cast<void *>(check_info), 0, sizeof(check_info)); |
385 | 8.21k | original.word = w; |
386 | 8.21k | l->to_lower(original.lower, w.str()); |
387 | 8.21k | l->to_clean(original.clean, w.str()); |
388 | 8.21k | l->to_soundslike(original.soundslike, w.str()); |
389 | 8.21k | original.case_pattern = l->case_pattern(w); |
390 | 8.21k | camel_case = parms->camel_case; |
391 | 8.21k | } |
392 | 684 | void with_presuf(ParmStr pre, ParmStr suf) { |
393 | 684 | prefix = pre; |
394 | 684 | suffix = suf; |
395 | 684 | have_presuf = true; |
396 | 684 | } |
397 | | bool camel_case; |
398 | | // `this` is expected to be allocated with new and its ownership |
399 | | // will be transferred to the returning Sugs object |
400 | | Sugs * suggestions(); |
401 | | }; |
402 | | |
403 | | struct Suggestion { |
404 | | const char * word; |
405 | | const ScoreWordSound * inf; |
406 | 0 | double distance() const { |
407 | 0 | return inf->adj_score/100.0; |
408 | 0 | } |
409 | 0 | double normalized_score() const { |
410 | 0 | return 100.0/(inf->adj_score + 100); |
411 | 0 | } |
412 | 0 | Suggestion() : word(), inf() {} |
413 | | Suggestion(const char * word, const ScoreWordSound * inf) |
414 | 498k | : word(word), inf(inf) {} |
415 | | }; |
416 | | |
417 | | struct SavedBufs : public Vector<ObjStack::Memory *> { |
418 | 8.96k | void reset() { |
419 | 8.96k | for (Vector<ObjStack::Memory *>::iterator i = begin(), e = end(); |
420 | 17.1k | i != e; ++i) |
421 | 8.21k | ObjStack::dealloc(*i); |
422 | 8.96k | clear(); |
423 | 8.96k | } |
424 | 1.43k | ~SavedBufs() { |
425 | 1.43k | reset(); |
426 | 1.43k | } |
427 | | }; |
428 | | |
429 | | class SuggestionsImpl; |
430 | | |
431 | | class Sugs { |
432 | | public: |
433 | | Vector<Working *> srcs; |
434 | | NearMisses scored_near_misses; |
435 | | |
436 | 684 | void merge(Sugs & other) { |
437 | 684 | srcs.insert(srcs.end(), other.srcs.begin(), other.srcs.end()); |
438 | 684 | other.srcs.clear(); |
439 | 684 | scored_near_misses.merge(other.scored_near_misses, adj_score_lt); |
440 | 684 | } |
441 | | |
442 | | void transfer(SuggestionsImpl &, int limit); |
443 | | |
444 | 8.21k | Sugs(Working * s) { |
445 | 8.21k | srcs.push_back(s); |
446 | 8.21k | } |
447 | 8.21k | ~Sugs() { |
448 | 16.4k | for (Vector<Working *>::iterator i = srcs.begin(), e = srcs.end(); i != e; ++i) { |
449 | 8.21k | delete *i; |
450 | 8.21k | *i = NULL; |
451 | 8.21k | } |
452 | 8.21k | } |
453 | | }; |
454 | | |
455 | | class SuggestionsImpl : public SuggestionsData, public Vector<Suggestion> { |
456 | | public: |
457 | | SavedBufs saved_bufs_; |
458 | | NearMisses saved_near_misses_; |
459 | | ObjStack buf; |
460 | 1.43k | SuggestionsImpl() {} |
461 | | private: |
462 | | SuggestionsImpl(const SuggestionsImpl &); |
463 | | public: |
464 | 7.53k | void reset() { |
465 | 7.53k | clear(); |
466 | 7.53k | buf.reset(); |
467 | 7.53k | saved_bufs_.reset(); |
468 | 7.53k | saved_near_misses_.clear(); |
469 | 7.53k | } |
470 | 0 | void get_words(Convert * conv, Vector<CharVector> & res) { |
471 | 0 | res.clear(); |
472 | 0 | res.reserve(size()); |
473 | 0 | if (conv) { |
474 | 0 | for (iterator i = begin(), e = end(); i != e; ++i) { |
475 | 0 | res.push_back(CharVector()); |
476 | | // len + 1 to also convert the null |
477 | 0 | conv->convert(i->word, strlen(i->word) + 1, res.back()); |
478 | 0 | } |
479 | 0 | } else { |
480 | 0 | for (iterator i = begin(), e = end(); i != e; ++i) { |
481 | 0 | res.push_back(CharVector()); |
482 | 0 | res.reserve(strlen(i->word) + 1); |
483 | 0 | res.back().append(i->word); |
484 | 0 | res.back().append('\0'); |
485 | 0 | } |
486 | 0 | } |
487 | 0 | } |
488 | 0 | void get_normalized_scores(Vector<double> & res) { |
489 | 0 | res.clear(); |
490 | 0 | res.reserve(size()); |
491 | 0 | for (iterator i = begin(), e = end(); i != e; ++i) |
492 | 0 | res.push_back(i->normalized_score()); |
493 | 0 | } |
494 | 0 | void get_distances(Vector<double> & res) { |
495 | 0 | res.clear(); |
496 | 0 | res.reserve(size()); |
497 | 0 | for (iterator i = begin(), e = end(); i != e; ++i) |
498 | 0 | res.push_back(i->distance()); |
499 | 0 | } |
500 | | }; |
501 | | |
502 | 8.21k | Sugs * Working::suggestions() { |
503 | | |
504 | 8.21k | Sugs * sug = new Sugs(this); |
505 | | |
506 | 8.21k | if (original.word.size() * parms->edit_distance_weights.max >= 0x8000) |
507 | 370 | return sug; // to prevent overflow in the editdist functions |
508 | | |
509 | 7.84k | try_split(); |
510 | | |
511 | 7.84k | try_camel_edits(); |
512 | | |
513 | 7.84k | if (parms->use_repl_table) { |
514 | | |
515 | | #ifdef DEBUG_SUGGEST |
516 | | COUT.printl("TRYING REPLACEMENT TABLE"); |
517 | | #endif |
518 | | |
519 | 740 | try_repl(); |
520 | 740 | } |
521 | | |
522 | 7.84k | if (parms->try_one_edit_word) { |
523 | | |
524 | | #ifdef DEBUG_SUGGEST |
525 | | COUT.printl("TRYING ONE EDIT WORD"); |
526 | | #endif |
527 | | |
528 | 7.84k | try_one_edit_word(); |
529 | 7.84k | score_list(); |
530 | 7.84k | if (parms->check_after_one_edit_word) { |
531 | 0 | if (try_harder <= 0) goto done; |
532 | 0 | } |
533 | | // need to fine tune the score to account for special weights |
534 | | // applied to typos, otherwise some typos that produce very |
535 | | // different soundslike may be missed |
536 | 7.84k | fine_tune_score(LARGE_NUM); |
537 | 7.84k | } |
538 | | |
539 | 7.84k | if (parms->try_scan_0) { |
540 | | |
541 | | #ifdef DEBUG_SUGGEST |
542 | | COUT.printl("TRYING SCAN 0"); |
543 | | #endif |
544 | 372 | edit_dist_fun = limit0_edit_distance; |
545 | | |
546 | 372 | if (sp->soundslike_root_only) |
547 | 0 | try_scan_root(); |
548 | 372 | else |
549 | 372 | try_scan(); |
550 | | |
551 | 372 | score_list(); |
552 | | |
553 | 372 | } |
554 | | |
555 | 7.84k | if (parms->try_scan_1) { |
556 | | |
557 | | #ifdef DEBUG_SUGGEST |
558 | | COUT.printl("TRYING SCAN 1"); |
559 | | #endif |
560 | 7.37k | edit_dist_fun = limit1_edit_distance; |
561 | | |
562 | 7.37k | if (sp->soundslike_root_only) |
563 | 740 | try_scan_root(); |
564 | 6.63k | else |
565 | 6.63k | try_scan(); |
566 | | |
567 | 7.37k | score_list(); |
568 | | |
569 | 7.37k | if (try_harder <= 0) goto done; |
570 | | |
571 | 7.37k | } |
572 | | |
573 | 1.82k | if (parms->try_scan_2) { |
574 | | |
575 | | #ifdef DEBUG_SUGGEST |
576 | | COUT.printl("TRYING SCAN 2"); |
577 | | #endif |
578 | | |
579 | 1.44k | edit_dist_fun = limit2_edit_distance; |
580 | | |
581 | 1.44k | if (sp->soundslike_root_only) |
582 | 7 | try_scan_root(); |
583 | 1.44k | else |
584 | 1.44k | try_scan(); |
585 | | |
586 | 1.44k | score_list(); |
587 | | |
588 | 1.44k | if (try_harder < parms->ngram_threshold) goto done; |
589 | | |
590 | 1.44k | } |
591 | | |
592 | 992 | if (parms->try_ngram) { |
593 | | |
594 | | #ifdef DEBUG_SUGGEST |
595 | | COUT.printl("TRYING NGRAM"); |
596 | | #endif |
597 | | |
598 | 89 | try_ngram(); |
599 | | |
600 | 89 | score_list(); |
601 | | |
602 | 89 | } |
603 | | |
604 | 7.84k | done: |
605 | | |
606 | 7.84k | fine_tune_score(threshold); |
607 | 7.84k | scored_near_misses.sort(adj_score_lt); |
608 | 7.84k | sug->scored_near_misses.swap(scored_near_misses); |
609 | 7.84k | near_misses.clear(); |
610 | 7.84k | return sug; |
611 | 992 | } |
612 | | |
613 | | // Forms a word by combining CheckInfo fields. |
614 | | // Will grow the grow the temp in the buffer. The final |
615 | | // word must be null terminated and committed. |
616 | | // It returns a MutableString of what was appended to the buffer. |
617 | | MutableString Working::form_word(CheckInfo & ci) |
618 | 110k | { |
619 | 110k | size_t slen = ci.word.len - ci.pre_strip_len - ci.suf_strip_len; |
620 | 110k | size_t wlen = slen + ci.pre_add_len + ci.suf_add_len; |
621 | 110k | char * tmp = (char *)buffer.grow_temp(wlen); |
622 | 110k | if (ci.pre_add_len) |
623 | 1 | memcpy(tmp, ci.pre_add, ci.pre_add_len); |
624 | 110k | memcpy(tmp + ci.pre_add_len, ci.word.str + ci.pre_strip_len, slen); |
625 | 110k | if (ci.suf_add_len) |
626 | 714 | memcpy(tmp + ci.pre_add_len + slen, ci.suf_add, ci.suf_add_len); |
627 | 110k | return MutableString(tmp,wlen); |
628 | 110k | } |
629 | | |
630 | | void Working::try_word_n(ParmString str, const ScoreInfo & inf) |
631 | 3.52M | { |
632 | 3.52M | String word; |
633 | 3.52M | String buf; |
634 | 3.52M | WordEntry sw; |
635 | 3.52M | for (SpellerImpl::WS::const_iterator i = sp->suggest_ws.begin(); |
636 | 20.4M | i != sp->suggest_ws.end(); |
637 | 16.9M | ++i) |
638 | 16.9M | { |
639 | 16.9M | (*i)->clean_lookup(str, sw); |
640 | 17.1M | for (;!sw.at_end(); sw.adv()) |
641 | 165k | add_nearmiss_w(i, sw, inf); |
642 | 16.9M | } |
643 | 3.52M | if (sp->affix_compress) { |
644 | 678k | CheckInfo ci; memset(static_cast<void *>(&ci), 0, sizeof(ci)); |
645 | 678k | bool res = lang->affix()->affix_check(LookupInfo(sp, LookupInfo::Clean), str, ci, 0); |
646 | 678k | if (!res) return; |
647 | 919 | form_word(ci); |
648 | 919 | char * end = (char *)buffer.grow_temp(1); |
649 | 919 | char * tmp = (char *)buffer.temp_ptr(); |
650 | 919 | buffer.commit_temp(); |
651 | 919 | *end = '\0'; |
652 | 919 | add_nearmiss(tmp, end - tmp, 0, inf); |
653 | 919 | } |
654 | 3.52M | } |
655 | | |
656 | | bool Working::check_word_s(ParmString word, CheckInfo * ci) |
657 | 103k | { |
658 | 103k | WordEntry sw; |
659 | 103k | for (SpellerImpl::WS::const_iterator i = sp->suggest_ws.begin(); |
660 | 437k | i != sp->suggest_ws.end(); |
661 | 334k | ++i) |
662 | 370k | { |
663 | 370k | (*i)->clean_lookup(word, sw); |
664 | 370k | if (!sw.at_end()) { |
665 | 36.6k | ci->word = sw.word; |
666 | 36.6k | return true; |
667 | 36.6k | } |
668 | 370k | } |
669 | 66.8k | if (sp->affix_compress) { |
670 | 0 | return lang->affix()->affix_check(LookupInfo(sp, LookupInfo::Clean), word, *ci, 0); |
671 | 0 | } |
672 | 66.8k | return false; |
673 | 66.8k | } |
674 | | |
675 | | unsigned Working::check_word(char * word, char * word_end, CheckInfo * ci, |
676 | | /* it WILL modify word */ |
677 | | unsigned pos) |
678 | 103k | { |
679 | 103k | unsigned res = check_word_s(word, ci); |
680 | 103k | if (res) return pos + 1; |
681 | 66.8k | if (pos + 1 >= sp->run_together_limit_) return 0; |
682 | 0 | for (char * i = word + sp->run_together_min_; |
683 | 0 | i <= word_end - sp->run_together_min_; |
684 | 0 | ++i) |
685 | 0 | { |
686 | 0 | char t = *i; |
687 | 0 | *i = '\0'; |
688 | 0 | res = check_word_s(word, ci); |
689 | 0 | *i = t; |
690 | 0 | if (!res) continue; |
691 | 0 | res = check_word(i, word_end, ci + 1, pos + 1); |
692 | 0 | if (res) return res; |
693 | 0 | } |
694 | 0 | memset(static_cast<void *>(ci), 0, sizeof(CheckInfo)); |
695 | 0 | return 0; |
696 | 0 | } |
697 | | |
698 | | void Working::try_word_c(char * word, char * word_end, const ScoreInfo & inf) |
699 | 103k | { |
700 | 103k | unsigned res = check_word(word, word_end, check_info); |
701 | 103k | assert(res <= sp->run_together_limit_); |
702 | | //CERR.printf(">%s\n", word); |
703 | 103k | if (!res) return; |
704 | 36.6k | buffer.abort_temp(); |
705 | 36.6k | MutableString tmp = form_word(check_info[0]); |
706 | 36.6k | CasePattern cp = lang->case_pattern(tmp, tmp.size); |
707 | 109k | for (unsigned i = 1; i <= res; ++i) { |
708 | 73.2k | char * t = form_word(check_info[i]); |
709 | 73.2k | if (cp == FirstUpper && lang->is_lower(t[1])) |
710 | 4.37k | t[0] = lang->to_lower(t[0]); |
711 | 73.2k | } |
712 | 36.6k | char * end = (char *)buffer.grow_temp(1); |
713 | 36.6k | char * beg = (char *)buffer.temp_ptr(); // since the original string may of moved |
714 | 36.6k | *end = 0; |
715 | 36.6k | buffer.commit_temp(); |
716 | 36.6k | add_nearmiss(beg, end - beg, 0, inf); |
717 | | //CERR.printl(tmp); |
718 | 36.6k | memset(static_cast<void *>(check_info), 0, sizeof(CheckInfo)*res); |
719 | 36.6k | } |
720 | | |
721 | | void Working::add_nearmiss(char * word, unsigned word_size, |
722 | | WordInfo word_info, |
723 | | const ScoreInfo & inf) |
724 | 12.0M | { |
725 | 12.0M | if (word_size * parms->edit_distance_weights.max >= 0x8000) |
726 | 204 | return; // to prevent overflow in the editdist functions |
727 | | |
728 | 12.0M | near_misses.push_front(ScoreWordSound(this)); |
729 | 12.0M | ScoreWordSound & d = near_misses.front(); |
730 | 12.0M | d.word = word; |
731 | 12.0M | d.soundslike = inf.soundslike; |
732 | | |
733 | 12.0M | d.word_score = inf.word_score; |
734 | 12.0M | d.soundslike_score = inf.soundslike_score; |
735 | | |
736 | 12.0M | if (!sp->have_soundslike) { |
737 | 0 | if (d.word_score >= LARGE_NUM) d.word_score = d.soundslike_score; |
738 | 0 | else if (d.soundslike_score >= LARGE_NUM) d.soundslike_score = d.word_score; |
739 | 0 | } |
740 | | |
741 | 12.0M | unsigned int l = word_size; |
742 | 12.0M | if (l > max_word_length) max_word_length = l; |
743 | | |
744 | 12.0M | if (!(word_info & ALL_CLEAN)) { |
745 | 6.63M | d.word_clean = (char *)buffer.alloc(word_size + 1); |
746 | 6.63M | lang->LangImpl::to_clean((char *)d.word_clean, word); |
747 | 6.63M | } else { |
748 | 5.45M | d.word_clean = d.word; |
749 | 5.45M | } |
750 | | |
751 | 12.0M | if (!sp->have_soundslike && !d.soundslike) |
752 | 0 | d.soundslike = d.word_clean; |
753 | | |
754 | 12.0M | d.special_edit = inf.special_edit; |
755 | 12.0M | d.repl_table = inf.repl_table; |
756 | 12.0M | d.count = inf.count; |
757 | 12.0M | d.repl_list = inf.repl_list; |
758 | 12.0M | } |
759 | | |
760 | | void Working::add_nearmiss_w(SpellerImpl::WS::const_iterator i, |
761 | | const WordEntry & w, const ScoreInfo & inf0) |
762 | 9.31M | { |
763 | 9.31M | assert(w.word_size == strlen(w.word)); |
764 | 9.31M | ScoreInfo inf = inf0; |
765 | 9.31M | if (w.what == WordEntry::Misspelled) { |
766 | 0 | inf.repl_list = new WordEntry; |
767 | 0 | const ReplacementDict * repl_dict |
768 | 0 | = static_cast<const ReplacementDict *>(*i); |
769 | 0 | repl_dict->repl_lookup(w, *inf.repl_list); |
770 | 0 | } |
771 | 9.31M | add_nearmiss(buffer.dup(ParmString(w.word, w.word_size)), |
772 | 9.31M | w.word_size, w.word_info, inf); |
773 | 9.31M | } |
774 | | |
775 | | void Working::add_nearmiss_a(SpellerImpl::WS::const_iterator i, |
776 | | const WordAff * w, const ScoreInfo & inf) |
777 | 2.72M | { |
778 | 2.72M | add_nearmiss(buffer.dup(w->word), w->word.size, 0, inf); |
779 | 2.72M | } |
780 | | |
781 | 7.84k | void Working::try_split() { |
782 | 7.84k | const String & word = original.word; |
783 | | |
784 | 7.84k | if (word.size() < 4 || parms->split_chars.empty()) return; |
785 | 2.91k | size_t i = 0; |
786 | | |
787 | 2.91k | String new_word_str; |
788 | 2.91k | String buf; |
789 | 2.91k | new_word_str.resize(word.size() + 1); |
790 | 2.91k | char * new_word = new_word_str.data(); |
791 | 2.91k | memcpy(new_word, word.data(), word.size()); |
792 | 2.91k | new_word[word.size() + 1] = '\0'; |
793 | 2.91k | new_word[word.size() + 0] = new_word[word.size() - 1]; |
794 | | |
795 | 43.2k | for (i = word.size() - 2; i >= 2; --i) { |
796 | 40.2k | new_word[i+1] = new_word[i]; |
797 | 40.2k | new_word[i] = '\0'; |
798 | | |
799 | 40.2k | if (sp->check(new_word) && sp->check(new_word + i + 1)) { |
800 | 33.1k | for (size_t j = 0; j != parms->split_chars.size(); ++j) |
801 | 22.5k | { |
802 | 22.5k | new_word[i] = parms->split_chars[j]; |
803 | 22.5k | ScoreInfo inf; |
804 | 22.5k | inf.word_score = special_score(parms->edit_distance_weights, Split); |
805 | 22.5k | inf.soundslike_score = inf.word_score; |
806 | 22.5k | inf.soundslike = NO_SOUNDSLIKE; |
807 | 22.5k | inf.count = false; |
808 | 22.5k | inf.special_edit = Split; |
809 | 22.5k | add_nearmiss(buffer.dup(new_word), word.size() + 1, 0, inf); |
810 | 22.5k | } |
811 | 10.5k | } |
812 | 40.2k | } |
813 | 2.91k | } |
814 | | |
815 | 15.9k | void Working::try_camel_word(String & word, SpecialEdit edit) { |
816 | 15.9k | CheckInfo ci[8]; |
817 | 15.9k | bool ok = sp->check(word.begin(), word.end(), false, sp->run_together_limit(), ci, ci + 8, NULL, NULL); |
818 | 15.9k | if (!ok) return; |
819 | 1.03k | ScoreInfo inf; |
820 | 1.03k | inf.word_score = special_score(parms->edit_distance_weights, edit); |
821 | 1.03k | inf.soundslike_score = inf.word_score; |
822 | 1.03k | inf.soundslike = NO_SOUNDSLIKE; |
823 | 1.03k | inf.count = false; |
824 | 1.03k | inf.special_edit = edit; |
825 | 1.03k | add_nearmiss(buffer.dup(word.c_str()), word.size() + 1, 0, inf); |
826 | 1.03k | } |
827 | | |
828 | 7.84k | void Working::try_camel_edits() { |
829 | 7.84k | if (!camel_case) return; |
830 | | |
831 | 1.37k | String word = original.word; |
832 | 1.37k | word.ensure_null_end(); |
833 | | |
834 | 12.5k | for (size_t i = 1; i < word.size(); ++i) { |
835 | | // try splitting or joining a word by changing the case of a letter |
836 | 11.1k | SpecialEdit edit = None; |
837 | 11.1k | char save = word[i]; |
838 | 11.1k | word[i] = lang->to_upper(word[i]); |
839 | 11.1k | if (word[i] != save) { |
840 | 4.38k | edit = CamelSplit; |
841 | 6.81k | } else { |
842 | 6.81k | word[i] = lang->to_lower(word[i]); |
843 | 6.81k | if (word[i] != save) |
844 | 5.54k | edit = CamelJoin; |
845 | 6.81k | } |
846 | 11.1k | try_camel_word(word, edit); |
847 | | |
848 | | //if the char was made lower now also try making an adjacent character uppercase |
849 | 11.1k | if (edit == CamelJoin) { |
850 | 5.54k | char save2 = word[i-1]; |
851 | 5.54k | word[i-1] = lang->to_upper(word[i-1]); |
852 | 5.54k | if (word[i-1] != save2) |
853 | 2.72k | try_camel_word(word, CamelOffByOne); |
854 | 5.54k | word[i-1] = save2; |
855 | 5.54k | if (i+1 < word.size()) { |
856 | 4.68k | save2 = word[i+1]; |
857 | 4.68k | word[i+1] = lang->to_upper(word[i+1]); |
858 | 4.68k | if (word[i+1] != save2) |
859 | 1.99k | try_camel_word(word, CamelOffByOne); |
860 | 4.68k | word[i+1] = save2; |
861 | 4.68k | } |
862 | 5.54k | } |
863 | | |
864 | 11.1k | word[i] = save; |
865 | 11.1k | } |
866 | 1.37k | } |
867 | | |
868 | | void Working::try_one_edit_word() |
869 | 7.84k | { |
870 | 7.84k | const String & orig = original.clean; |
871 | 7.84k | const char * replace_list = lang->clean_chars(); |
872 | 7.84k | char a,b; |
873 | 7.84k | const char * c; |
874 | 7.84k | VARARRAY(char, new_word, orig.size() + 2); |
875 | 7.84k | char * new_word_end = new_word + orig.size(); |
876 | 7.84k | size_t i; |
877 | | |
878 | 7.84k | memcpy(new_word, orig.str(), orig.size() + 1); |
879 | | |
880 | | // Try word as is (in case of case difference etc) |
881 | | |
882 | 7.84k | try_word(new_word, new_word_end, 0); |
883 | | |
884 | | // Change one letter |
885 | | |
886 | 56.4k | for (i = 0; i != orig.size(); ++i) { |
887 | 1.69M | for (c = replace_list; *c; ++c) { |
888 | 1.64M | if (*c == orig[i]) continue; |
889 | 1.59M | new_word[i] = *c; |
890 | 1.59M | try_word(new_word, new_word_end, parms->edit_distance_weights.sub); |
891 | 1.59M | } |
892 | 48.5k | new_word[i] = orig[i]; |
893 | 48.5k | } |
894 | | |
895 | | // Interchange two adjacent letters. |
896 | | |
897 | 50.2k | for (i = 0; i+1 < orig.size(); ++i) { |
898 | 42.4k | a = new_word[i]; |
899 | 42.4k | b = new_word[i+1]; |
900 | 42.4k | new_word[i] = b; |
901 | 42.4k | new_word[i+1] = a; |
902 | 42.4k | try_word(new_word, new_word_end, parms->edit_distance_weights.swap); |
903 | 42.4k | new_word[i] = a; |
904 | 42.4k | new_word[i+1] = b; |
905 | 42.4k | } |
906 | | |
907 | | // Add one letter |
908 | | |
909 | 7.84k | *new_word_end = ' '; |
910 | 7.84k | new_word_end++; |
911 | 7.84k | *new_word_end = '\0'; |
912 | 7.84k | i = new_word_end - new_word - 1; |
913 | 56.4k | while(true) { |
914 | 1.96M | for (c=replace_list; *c; ++c) { |
915 | 1.90M | new_word[i] = *c; |
916 | 1.90M | try_word(new_word, new_word_end, parms->edit_distance_weights.del1); |
917 | 1.90M | } |
918 | 56.4k | if (i == 0) break; |
919 | 48.5k | new_word[i] = new_word[i-1]; |
920 | 48.5k | --i; |
921 | 48.5k | } |
922 | | |
923 | | // Delete one letter |
924 | | |
925 | 7.84k | if (orig.size() > 1) { |
926 | 6.02k | memcpy(new_word, orig.str(), orig.size() + 1); |
927 | 6.02k | new_word_end = new_word + orig.size() - 1; |
928 | 6.02k | a = *new_word_end; |
929 | 6.02k | *new_word_end = '\0'; |
930 | 6.02k | i = orig.size() - 1; |
931 | 48.4k | while (true) { |
932 | 48.4k | try_word(new_word, new_word_end, parms->edit_distance_weights.del2); |
933 | 48.4k | if (i == 0) break; |
934 | 42.4k | b = a; |
935 | 42.4k | a = new_word[i-1]; |
936 | 42.4k | new_word[i-1] = b; |
937 | 42.4k | --i; |
938 | 42.4k | } |
939 | 6.02k | } |
940 | 7.84k | } |
941 | | |
942 | | void Working::add_sound(SpellerImpl::WS::const_iterator i, |
943 | | WordEntry * sw, const char * sl, int score) |
944 | 1.93M | { |
945 | 1.93M | WordEntry w; |
946 | 1.93M | (*i)->soundslike_lookup(*sw, w); |
947 | | |
948 | 11.0M | for (; !w.at_end(); w.adv()) { |
949 | | |
950 | 9.14M | ScoreInfo inf; |
951 | 9.14M | inf.soundslike = sl; |
952 | 9.14M | inf.soundslike_score = score; |
953 | 9.14M | add_nearmiss_w(i, w, inf); |
954 | | |
955 | 9.14M | if (w.aff[0]) { |
956 | 442k | String sl_buf; |
957 | 442k | temp_buffer.reset(); |
958 | 442k | WordAff * exp_list; |
959 | 442k | exp_list = lang->affix()->expand(w.word, w.aff, temp_buffer); |
960 | 3.16M | for (WordAff * p = exp_list->next; p; p = p->next) { |
961 | 2.72M | add_nearmiss_a(i, p, ScoreInfo()); |
962 | 2.72M | } |
963 | 442k | } |
964 | | |
965 | 9.14M | } |
966 | 1.93M | } |
967 | | |
968 | | void Working::try_scan() |
969 | 8.44k | { |
970 | 8.44k | const char * original_soundslike = original.soundslike.str(); |
971 | | |
972 | 8.44k | WordEntry * sw; |
973 | 8.44k | WordEntry w; |
974 | 8.44k | const char * sl = 0; |
975 | 8.44k | EditDist score; |
976 | 8.44k | unsigned int stopped_at = LARGE_NUM; |
977 | 8.44k | WordAff * exp_list; |
978 | 8.44k | WordAff single; |
979 | 8.44k | single.next = 0; |
980 | | |
981 | 8.44k | for (SpellerImpl::WS::const_iterator i = sp->suggest_ws.begin(); |
982 | 50.6k | i != sp->suggest_ws.end(); |
983 | 42.2k | ++i) |
984 | 42.2k | { |
985 | | //CERR.printf(">>%p %s\n", *i, typeid(**i).name()); |
986 | 42.2k | StackPtr<SoundslikeEnumeration> els((*i)->soundslike_elements()); |
987 | | |
988 | 17.0M | while ( (sw = els->next(stopped_at)) ) { |
989 | | |
990 | | //CERR.printf("[%s (%d) %d]\n", sw->word, sw->word_size, sw->what); |
991 | | //assert(strlen(sw->word) == sw->word_size); |
992 | | |
993 | 17.0M | if (sw->what != WordEntry::Word) { |
994 | 17.0M | sl = sw->word; |
995 | 17.0M | abort_temp(); |
996 | 17.0M | } else if (!*sw->aff) { |
997 | 0 | sl = to_soundslike_temp(*sw); |
998 | 0 | } else { |
999 | 0 | goto affix_case; |
1000 | 0 | } |
1001 | | |
1002 | | //CERR.printf("SL = %s\n", sl); |
1003 | | |
1004 | 17.0M | score = edit_dist_fun(sl, original_soundslike, parms->edit_distance_weights); |
1005 | 17.0M | stopped_at = score.stopped_at - sl; |
1006 | 17.0M | if (score >= LARGE_NUM) continue; |
1007 | 1.25M | stopped_at = LARGE_NUM; |
1008 | 1.25M | commit_temp(sl); |
1009 | 1.25M | add_sound(i, sw, sl, score); |
1010 | 1.25M | continue; |
1011 | | |
1012 | 0 | affix_case: |
1013 | | |
1014 | 0 | temp_buffer.reset(); |
1015 | | |
1016 | | // first expand any prefixes |
1017 | 0 | if (sp->fast_scan) { // if fast_scan, then no prefixes |
1018 | 0 | single.word.str = sw->word; |
1019 | 0 | single.word.size = strlen(sw->word); |
1020 | 0 | single.aff = (const unsigned char *)sw->aff; |
1021 | 0 | exp_list = &single; |
1022 | 0 | } else { |
1023 | 0 | exp_list = lang->affix()->expand_prefix(sw->word, sw->aff, temp_buffer); |
1024 | 0 | } |
1025 | | |
1026 | | // iterate through each semi-expanded word, any affix flags |
1027 | | // are now guaranteed to be suffixes |
1028 | 0 | for (WordAff * p = exp_list; p; p = p->next) |
1029 | 0 | { |
1030 | | // try the root word |
1031 | 0 | unsigned sl_len; |
1032 | 0 | sl = to_soundslike_temp(p->word.str, p->word.size, &sl_len); |
1033 | 0 | score = edit_dist_fun(sl, original_soundslike, parms->edit_distance_weights); |
1034 | 0 | stopped_at = score.stopped_at - sl; |
1035 | 0 | stopped_at += p->word.size - sl_len; |
1036 | | |
1037 | 0 | if (score < LARGE_NUM) { |
1038 | 0 | commit_temp(sl); |
1039 | 0 | ScoreInfo inf; |
1040 | 0 | inf.soundslike = sl; |
1041 | 0 | inf.soundslike_score = score; |
1042 | 0 | add_nearmiss_a(i, p, inf); |
1043 | 0 | } |
1044 | | |
1045 | | // expand any suffixes, using stopped_at as a hint to avoid |
1046 | | // unneeded expansions. Note stopped_at is the last character |
1047 | | // looked at by limit_edit_dist. Thus if the character |
1048 | | // at stopped_at is changed it might effect the result |
1049 | | // hence the "limit" is stopped_at + 1 |
1050 | 0 | if (p->word.size - lang->affix()->max_strip() > stopped_at) |
1051 | 0 | exp_list = 0; |
1052 | 0 | else |
1053 | 0 | exp_list = lang->affix()->expand_suffix(p->word, p->aff, |
1054 | 0 | temp_buffer, |
1055 | 0 | stopped_at + 1); |
1056 | | |
1057 | | // reset stopped_at if necessary |
1058 | 0 | if (score < LARGE_NUM) stopped_at = LARGE_NUM; |
1059 | | |
1060 | | // iterate through fully expanded words, if any |
1061 | 0 | for (WordAff * q = exp_list; q; q = q->next) { |
1062 | 0 | sl = to_soundslike_temp(q->word.str, q->word.size); |
1063 | 0 | score = edit_dist_fun(sl, original_soundslike, parms->edit_distance_weights); |
1064 | 0 | if (score >= LARGE_NUM) continue; |
1065 | 0 | commit_temp(sl); |
1066 | 0 | ScoreInfo inf; |
1067 | 0 | inf.soundslike = sl; |
1068 | 0 | inf.soundslike_score = score; |
1069 | 0 | add_nearmiss_a(i, q, inf); |
1070 | 0 | } |
1071 | 0 | } |
1072 | 0 | } |
1073 | 42.2k | } |
1074 | 8.44k | } |
1075 | | |
1076 | | void Working::try_scan_root() |
1077 | 747 | { |
1078 | | |
1079 | 747 | WordEntry * sw; |
1080 | 747 | WordEntry w; |
1081 | 747 | const char * sl = 0; |
1082 | 747 | EditDist score; |
1083 | 747 | int stopped_at = LARGE_NUM; |
1084 | 747 | GuessInfo gi; |
1085 | 747 | lang->munch(original.word, &gi); |
1086 | 747 | Vector<const char *> sls; |
1087 | 747 | sls.push_back(original.soundslike.str()); |
1088 | | #ifdef DEBUG_SUGGEST |
1089 | | COUT.printf("will try soundslike: %s\n", sls.back()); |
1090 | | #endif |
1091 | 747 | for (const aspeller::CheckInfo * ci = gi.head; |
1092 | 856 | ci; |
1093 | 747 | ci = ci->next) |
1094 | 109 | { |
1095 | 109 | sl = to_soundslike(ci->word.str, ci->word.len); |
1096 | 109 | Vector<const char *>::iterator i = sls.begin(); |
1097 | 208 | while (i != sls.end() && strcmp(*i, sl) != 0) ++i; |
1098 | 109 | if (i == sls.end()) { |
1099 | 67 | sls.push_back(to_soundslike(ci->word.str, ci->word.len)); |
1100 | | #ifdef DEBUG_SUGGEST |
1101 | | COUT.printf("will try root soundslike: %s\n", sls.back()); |
1102 | | #endif |
1103 | 67 | } |
1104 | 109 | } |
1105 | 747 | const char * * begin = sls.pbegin(); |
1106 | 747 | const char * * end = sls.pend(); |
1107 | 747 | for (SpellerImpl::WS::const_iterator i = sp->suggest_ws.begin(); |
1108 | 3.73k | i != sp->suggest_ws.end(); |
1109 | 2.98k | ++i) |
1110 | 2.98k | { |
1111 | 2.98k | StackPtr<SoundslikeEnumeration> els((*i)->soundslike_elements()); |
1112 | | |
1113 | 2.00M | while ( (sw = els->next(stopped_at)) ) { |
1114 | | |
1115 | 2.00M | if (sw->what != WordEntry::Word) { |
1116 | 684k | sl = sw->word; |
1117 | 684k | abort_temp(); |
1118 | 1.31M | } else { |
1119 | 1.31M | sl = to_soundslike_temp(*sw); |
1120 | 1.31M | } |
1121 | | |
1122 | 2.00M | stopped_at = LARGE_NUM; |
1123 | 3.85M | for (const char * * s = begin; s != end; ++s) { |
1124 | 2.54M | score = edit_dist_fun(sl, *s, |
1125 | 2.54M | parms->edit_distance_weights); |
1126 | 2.54M | if (score.stopped_at - sl < stopped_at) |
1127 | 2.20M | stopped_at = score.stopped_at - sl; |
1128 | 2.54M | if (score >= LARGE_NUM) continue; |
1129 | 685k | stopped_at = LARGE_NUM; |
1130 | 685k | commit_temp(sl); |
1131 | 685k | add_sound(i, sw, sl, score); |
1132 | | //CERR.printf("using %s: will add %s with score %d\n", *s, sl, (int)score); |
1133 | 685k | break; |
1134 | 2.54M | } |
1135 | 2.00M | } |
1136 | 2.98k | } |
1137 | 747 | } |
1138 | | |
1139 | | struct ReplTry |
1140 | | { |
1141 | | const char * begin; |
1142 | | const char * end; |
1143 | | const char * repl; |
1144 | | size_t repl_len; |
1145 | | ReplTry(const char * b, const char * e, const char * r) |
1146 | 0 | : begin(b), end(e), repl(r), repl_len(strlen(r)) {} |
1147 | | }; |
1148 | | |
1149 | | void Working::try_repl() |
1150 | 740 | { |
1151 | 740 | String buf; |
1152 | 740 | Vector<ReplTry> repl_try; |
1153 | 740 | StackPtr<SuggestReplEnumeration> els(lang->repl()); |
1154 | 740 | const SuggestRepl * r = 0; |
1155 | 740 | const char * word = original.clean.str(); |
1156 | 740 | const char * wend = word + original.clean.size(); |
1157 | 314k | while (r = els->next(), r) |
1158 | 313k | { |
1159 | 313k | const char * p = word; |
1160 | 339k | while ((p = strstr(p, r->substr))) { |
1161 | 25.3k | buf.clear(); |
1162 | 25.3k | buf.append(word, p); |
1163 | 25.3k | buf.append(r->repl, strlen(r->repl)); |
1164 | 25.3k | p += strlen(r->substr); |
1165 | 25.3k | buf.append(p, wend + 1); |
1166 | 25.3k | buf.ensure_null_end(); |
1167 | | //COUT.printf("%s (%s) => %s (%s)\n", word, r->substr, buf.str(), r->repl); |
1168 | 25.3k | ScoreInfo inf; |
1169 | 25.3k | inf.word_score = parms->edit_distance_weights.sub*3/2; |
1170 | 25.3k | inf.repl_table = true; |
1171 | 25.3k | try_word(buf.pbegin(), buf.pend(), inf); |
1172 | 25.3k | } |
1173 | 313k | } |
1174 | 740 | } |
1175 | | |
1176 | | // generate an n-gram score comparing s1 and s2 |
1177 | | static int ngram(int n, char * s1, int l1, const char * s2, int l2) |
1178 | 4.48M | { |
1179 | 4.48M | int nscore = 0; |
1180 | 4.48M | int ns; |
1181 | 4.75M | for (int j=1;j<=n;j++) { |
1182 | 4.75M | ns = 0; |
1183 | 64.8M | for (int i=0;i<=(l1-j);i++) { |
1184 | 60.0M | char c = *(s1 + i + j); |
1185 | 60.0M | *(s1 + i + j) = '\0'; |
1186 | 60.0M | if (strstr(s2,(s1+i))) ns++; |
1187 | 60.0M | *(s1 + i + j ) = c; |
1188 | 60.0M | } |
1189 | 4.75M | nscore = nscore + ns; |
1190 | 4.75M | if (ns < 2) break; |
1191 | 4.75M | } |
1192 | 4.48M | ns = 0; |
1193 | 4.48M | ns = (l2-l1)-2; |
1194 | 4.48M | return (nscore - ((ns > 0) ? ns : 0)); |
1195 | 4.48M | } |
1196 | | |
1197 | | struct NGramScore { |
1198 | | SpellerImpl::WS::const_iterator i; |
1199 | | WordEntry info; |
1200 | | const char * soundslike; |
1201 | | int score; |
1202 | 0 | NGramScore() {} |
1203 | | NGramScore(SpellerImpl::WS::const_iterator i0, |
1204 | | const WordEntry & info0, const char * sl, int score0) |
1205 | 2.12k | : i(i0), info(info0), soundslike(sl), score(score0) {} |
1206 | | }; |
1207 | | |
1208 | | |
1209 | | void Working::try_ngram() |
1210 | 89 | { |
1211 | 89 | String original_soundslike = original.soundslike; |
1212 | 89 | original_soundslike.ensure_null_end(); |
1213 | 89 | WordEntry * sw = 0; |
1214 | 89 | const char * sl = 0; |
1215 | 89 | typedef Vector<NGramScore> Candidates; |
1216 | 89 | hash_set<const char *> already_have; |
1217 | 89 | Candidates candidates; |
1218 | 89 | int min_score = 0; |
1219 | 89 | int count = 0; |
1220 | | |
1221 | 89 | for (NearMisses::iterator i = scored_near_misses.begin(); |
1222 | 692k | i != scored_near_misses.end(); ++i) |
1223 | 692k | { |
1224 | 692k | if (!i->soundslike) |
1225 | 0 | i->soundslike = to_soundslike(i->word, strlen(i->word)); |
1226 | 692k | already_have.insert(i->soundslike); |
1227 | 692k | } |
1228 | | |
1229 | 89 | for (SpellerImpl::WS::const_iterator i = sp->suggest_ws.begin(); |
1230 | 534 | i != sp->suggest_ws.end(); |
1231 | 445 | ++i) |
1232 | 445 | { |
1233 | 445 | StackPtr<SoundslikeEnumeration> els((*i)->soundslike_elements()); |
1234 | | |
1235 | 4.52M | while ( (sw = els->next(LARGE_NUM)) ) { |
1236 | | |
1237 | 4.52M | if (sw->what != WordEntry::Word) { |
1238 | 4.52M | abort_temp(); |
1239 | 4.52M | sl = sw->word; |
1240 | 4.52M | } else { |
1241 | 0 | sl = to_soundslike_temp(sw->word, sw->word_size); |
1242 | 0 | } |
1243 | | |
1244 | 4.52M | if (already_have.have(sl)) continue; |
1245 | | |
1246 | 4.48M | int ng = ngram(3, original_soundslike.data(), original_soundslike.size(), |
1247 | 4.48M | sl, strlen(sl)); |
1248 | | |
1249 | 4.48M | if (ng > 0 && ng >= min_score) { |
1250 | 2.12k | commit_temp(sl); |
1251 | 2.12k | candidates.push_back(NGramScore(i, *sw, sl, ng)); |
1252 | 2.12k | if (ng > min_score) count++; |
1253 | 2.12k | if (count >= parms->ngram_keep) { |
1254 | 130 | int orig_min = min_score; |
1255 | 130 | min_score = LARGE_NUM; |
1256 | 130 | Candidates::iterator i = candidates.begin(); |
1257 | 130 | Candidates::iterator j = candidates.begin(); |
1258 | 2.24k | for (; i != candidates.end(); ++i) { |
1259 | 2.11k | if (i->score == orig_min) continue; |
1260 | 1.30k | if (min_score > i->score) min_score = i->score; |
1261 | 1.30k | *j = *i; |
1262 | 1.30k | ++j; |
1263 | 1.30k | } |
1264 | 130 | count = 0; |
1265 | 130 | candidates.resize(j-candidates.begin()); |
1266 | 1.43k | for (i = candidates.begin(); i != candidates.end(); ++i) { |
1267 | 1.30k | if (i->score != min_score) count++; |
1268 | 1.30k | } |
1269 | 130 | } |
1270 | 2.12k | } |
1271 | 4.48M | } |
1272 | 445 | } |
1273 | | |
1274 | 89 | for (Candidates::iterator i = candidates.begin(); |
1275 | 1.39k | i != candidates.end(); |
1276 | 1.30k | ++i) |
1277 | 1.30k | { |
1278 | | //COUT.printf("ngram: %s %d\n", i->soundslike, i->score); |
1279 | 1.30k | add_sound(i->i, &i->info, i->soundslike); |
1280 | 1.30k | } |
1281 | 89 | } |
1282 | | |
1283 | 17.1k | void Working::score_list() { |
1284 | | |
1285 | | # ifdef DEBUG_SUGGEST |
1286 | | COUT.printl("SCORING LIST"); |
1287 | | # endif |
1288 | | |
1289 | 17.1k | try_harder = 3; |
1290 | 17.1k | if (near_misses.empty()) return; |
1291 | | |
1292 | 13.2k | NearMisses::iterator i; |
1293 | 13.2k | NearMisses::iterator prev; |
1294 | | |
1295 | 13.2k | near_misses.push_front(ScoreWordSound(this)); |
1296 | | // the first item will NEVER be looked at. |
1297 | 13.2k | scored_near_misses.push_front(ScoreWordSound(this)); |
1298 | 13.2k | scored_near_misses.front().score = -1; |
1299 | | // this item will only be looked at when sorting so |
1300 | | // make it a small value to keep it at the front. |
1301 | | |
1302 | 13.2k | int try_for = (parms->word_weight*parms->edit_distance_weights.max)/100; |
1303 | 28.1k | while (true) { |
1304 | 28.1k | try_for += (parms->word_weight*parms->edit_distance_weights.max)/100; |
1305 | | |
1306 | | // put all pairs whose score <= initial_limit*max_weight |
1307 | | // into the scored list |
1308 | | |
1309 | 28.1k | prev = near_misses.begin(); |
1310 | 28.1k | i = prev; |
1311 | 28.1k | ++i; |
1312 | 23.8M | while (i != near_misses.end()) { |
1313 | | |
1314 | | //CERR.printf("%s %s %s %d %d\n", i->word, i->word_clean, i->soundslike, |
1315 | | // i->word_score, i->soundslike_score); |
1316 | | |
1317 | 23.8M | if (i->word_score >= LARGE_NUM) { |
1318 | 16.1M | int sl_score = i->soundslike_score < LARGE_NUM ? i->soundslike_score : 0; |
1319 | 16.1M | int level = needed_level(try_for, sl_score); |
1320 | | |
1321 | 16.1M | if (level >= int(sl_score/parms->edit_distance_weights.min)) |
1322 | 13.1M | i->word_score = edit_distance(original.clean, |
1323 | 13.1M | i->word_clean, |
1324 | 13.1M | level, level, |
1325 | 13.1M | parms->edit_distance_weights); |
1326 | 16.1M | } |
1327 | | |
1328 | 23.8M | if (i->word_score >= LARGE_NUM) goto cont1; |
1329 | | |
1330 | 12.5M | if (i->soundslike_score >= LARGE_NUM) |
1331 | 3.33M | { |
1332 | 3.33M | if (weighted_average(0, i->word_score) > try_for) goto cont1; |
1333 | | |
1334 | 221k | if (i->soundslike == 0) i->soundslike = to_soundslike(i->word, strlen(i->word)); |
1335 | | |
1336 | 221k | i->soundslike_score = edit_distance(original.soundslike, i->soundslike, |
1337 | 221k | parms->edit_distance_weights); |
1338 | 221k | } |
1339 | | |
1340 | 9.44M | i->score = weighted_average(i->soundslike_score, i->word_score); |
1341 | | |
1342 | 9.44M | if (i->score > try_for + parms->span) goto cont1; |
1343 | | |
1344 | | //CERR.printf("2>%s %s %s %d %d\n", i->word, i->word_clean, i->soundslike, |
1345 | | // i->word_score, i->soundslike_score); |
1346 | | |
1347 | 2.53M | scored_near_misses.splice_into(near_misses,prev,i); |
1348 | | |
1349 | 2.53M | i = prev; // Yes this is right due to the slice |
1350 | 2.53M | ++i; |
1351 | | |
1352 | 2.53M | continue; |
1353 | | |
1354 | 21.3M | cont1: |
1355 | 21.3M | prev = i; |
1356 | 21.3M | ++i; |
1357 | 21.3M | } |
1358 | | |
1359 | 28.1k | scored_near_misses.sort(); |
1360 | | |
1361 | 28.1k | i = scored_near_misses.begin(); |
1362 | 28.1k | ++i; |
1363 | | |
1364 | 28.1k | if (i == scored_near_misses.end()) continue; |
1365 | | |
1366 | 17.8k | int k = skip_first_couple(i); |
1367 | | |
1368 | 17.8k | if ((k == parms->skip && i->score <= try_for) |
1369 | 17.8k | || prev == near_misses.begin() ) // or no more left in near_misses |
1370 | 13.2k | break; |
1371 | 17.8k | } |
1372 | | |
1373 | 13.2k | threshold = i->score + parms->span; |
1374 | 13.2k | if (threshold < parms->edit_distance_weights.max) |
1375 | 4.63k | threshold = parms->edit_distance_weights.max; |
1376 | | |
1377 | | # ifdef DEBUG_SUGGEST |
1378 | | COUT << "Threshold is: " << threshold << "\n"; |
1379 | | COUT << "try_for: " << try_for << "\n"; |
1380 | | COUT << "Size of scored: " << scored_near_misses.size() << "\n"; |
1381 | | COUT << "Size of ! scored: " << near_misses.size() << "\n"; |
1382 | | # endif |
1383 | | |
1384 | | //if (threshold - try_for <= parms->edit_distance_weights.max/2) return; |
1385 | | |
1386 | 13.2k | prev = near_misses.begin(); |
1387 | 13.2k | i = prev; |
1388 | 13.2k | ++i; |
1389 | 9.70M | while (i != near_misses.end()) { |
1390 | | |
1391 | 9.68M | if (i->word_score >= LARGE_NUM) { |
1392 | | |
1393 | 7.12M | int sl_score = i->soundslike_score < LARGE_NUM ? i->soundslike_score : 0; |
1394 | 7.12M | int initial_level = needed_level(try_for, sl_score); |
1395 | 7.12M | int max_level = needed_level(threshold, sl_score); |
1396 | | |
1397 | 7.12M | if (initial_level < max_level) |
1398 | 4.18M | i->word_score = edit_distance(original.clean.c_str(), |
1399 | 4.18M | i->word_clean, |
1400 | 4.18M | initial_level+1,max_level, |
1401 | 4.18M | parms->edit_distance_weights); |
1402 | 7.12M | } |
1403 | | |
1404 | 9.68M | if (i->word_score >= LARGE_NUM) goto cont2; |
1405 | | |
1406 | 4.33M | if (i->soundslike_score >= LARGE_NUM) |
1407 | 2.53M | { |
1408 | 2.53M | if (weighted_average(0, i->word_score) > threshold) goto cont2; |
1409 | | |
1410 | 56.2k | if (i->soundslike == 0) |
1411 | 56.2k | i->soundslike = to_soundslike(i->word, strlen(i->word)); |
1412 | | |
1413 | 56.2k | i->soundslike_score = edit_distance(original.soundslike, i->soundslike, |
1414 | 56.2k | parms->edit_distance_weights); |
1415 | 56.2k | } |
1416 | | |
1417 | 1.85M | i->score = weighted_average(i->soundslike_score, i->word_score); |
1418 | | |
1419 | 1.85M | if (i->score > threshold + parms->span) goto cont2; |
1420 | | |
1421 | 720k | scored_near_misses.splice_into(near_misses,prev,i); |
1422 | | |
1423 | 720k | i = prev; // Yes this is right due to the slice |
1424 | 720k | ++i; |
1425 | | |
1426 | 720k | continue; |
1427 | | |
1428 | 8.96M | cont2: |
1429 | 8.96M | prev = i; |
1430 | 8.96M | ++i; |
1431 | | |
1432 | 8.96M | } |
1433 | | |
1434 | 13.2k | near_misses.pop_front(); |
1435 | | |
1436 | 13.2k | scored_near_misses.sort(); |
1437 | 13.2k | scored_near_misses.pop_front(); |
1438 | | |
1439 | 13.2k | if (near_misses.empty()) { |
1440 | 6.70k | try_harder = 1; |
1441 | 6.70k | } else { |
1442 | 6.57k | i = scored_near_misses.begin(); |
1443 | 6.57k | skip_first_couple(i); |
1444 | 6.57k | ++i; |
1445 | 6.57k | try_harder = i == scored_near_misses.end() ? 2 : 0; |
1446 | 6.57k | } |
1447 | | |
1448 | | # ifdef DEBUG_SUGGEST |
1449 | | COUT << "Size of scored: " << scored_near_misses.size() << "\n"; |
1450 | | COUT << "Size of ! scored: " << near_misses.size() << "\n"; |
1451 | | COUT << "Try Harder: " << try_harder << "\n"; |
1452 | | # endif |
1453 | 13.2k | } |
1454 | | |
1455 | 15.6k | void Working::fine_tune_score(int thres) { |
1456 | | |
1457 | 15.6k | NearMisses::iterator i; |
1458 | | |
1459 | 15.6k | if (parms->use_typo_analysis) { |
1460 | 15.5k | adj_threshold = 0; |
1461 | 15.5k | unsigned int j; |
1462 | | |
1463 | 15.5k | CharVector orig_norm, word; |
1464 | 15.5k | orig_norm.resize(original.word.size() + 1); |
1465 | 158k | for (j = 0; j != original.word.size(); ++j) |
1466 | 142k | orig_norm[j] = parms->ti->to_normalized(original.word[j]); |
1467 | 15.5k | orig_norm[j] = 0; |
1468 | 15.5k | ParmString orig(orig_norm.data(), j); |
1469 | 15.5k | word.resize(max_word_length + 1); |
1470 | | |
1471 | 15.5k | for (i = scored_near_misses.begin(); |
1472 | 1.03M | i != scored_near_misses.end() && i->score <= thres; |
1473 | 1.01M | ++i) |
1474 | 1.01M | { |
1475 | 1.01M | SpecialTypoScore special = special_typo_score(*parms->ti, i->special_edit); |
1476 | 1.01M | if (special) { |
1477 | 42.2k | i->word_score = special.score; |
1478 | 42.2k | i->soundslike_score = i->word_score; |
1479 | 42.2k | i->adj_score = i->word_score; |
1480 | 42.2k | } |
1481 | 1.01M | if (i->adj_score >= LARGE_NUM) { |
1482 | 776k | if (!special) { |
1483 | 2.50M | for (j = 0; (i->word)[j] != 0; ++j) |
1484 | 1.73M | word[j] = parms->ti->to_normalized((i->word)[j]); |
1485 | 776k | word[j] = 0; |
1486 | 776k | int new_score = typo_edit_distance(ParmString(word.data(), j), orig, *parms->ti); |
1487 | | // if a repl. table was used we don't want to increase the score |
1488 | 776k | if (!i->repl_table || new_score < i->word_score) |
1489 | 776k | i->word_score = new_score; |
1490 | 776k | } |
1491 | 776k | if (!special.is_overall_score) |
1492 | 776k | i->adj_score = adj_wighted_average(i->soundslike_score, i->word_score, parms->ti->max); |
1493 | 776k | } |
1494 | 1.01M | if (i->adj_score > adj_threshold) |
1495 | 52.3k | adj_threshold = i->adj_score; |
1496 | 1.01M | } |
1497 | 15.5k | } else { |
1498 | 188 | for (i = scored_near_misses.begin(); |
1499 | 688k | i != scored_near_misses.end() && i->score <= thres; |
1500 | 688k | ++i) |
1501 | 688k | { |
1502 | 688k | i->adj_score = i->score; |
1503 | 688k | } |
1504 | 188 | adj_threshold = threshold; |
1505 | 188 | } |
1506 | | |
1507 | 1.78M | for (; i != scored_near_misses.end(); ++i) { |
1508 | 1.77M | if (i->adj_score > adj_threshold) |
1509 | 1.76M | i->adj_score = LARGE_NUM; |
1510 | 1.77M | } |
1511 | 15.6k | } |
1512 | | |
1513 | | struct StrEquals { |
1514 | 545k | bool operator() (const char * x, const char * y) const { |
1515 | 545k | return strcmp(x,y) == 0; |
1516 | 545k | } |
1517 | | }; |
1518 | | typedef hash_set<const char *,hash<const char *>,StrEquals> StrHashSet; |
1519 | | |
1520 | 26.2k | char * Working::fix_word(ObjStack & buf, ParmStr w) { |
1521 | 26.2k | size_t sz = prefix.size() + w.size() + suffix.size(); |
1522 | 26.2k | char * word = static_cast<char *>(buf.alloc(sz + 1)); |
1523 | 26.2k | char * i = word; |
1524 | 26.2k | memcpy(i, prefix.c_str(), prefix.size()); |
1525 | 26.2k | i += prefix.size(); |
1526 | 26.2k | memcpy(i, w.str(), w.size() + 1); |
1527 | 26.2k | fix_case(i); |
1528 | 26.2k | i += w.size(); |
1529 | 26.2k | memcpy(i, suffix.c_str(), suffix.size() + 1); |
1530 | 26.2k | return word; |
1531 | 26.2k | } |
1532 | | |
1533 | 7.53k | void Sugs::transfer(SuggestionsImpl & res, int limit) { |
1534 | | // FIXME: double check that conv->in_code() is correct |
1535 | 7.53k | res.reset(); |
1536 | | //# ifdef DEBUG_SUGGEST |
1537 | | // COUT << "\n" << "\n" |
1538 | | // << original.word << '\t' |
1539 | | // << original.soundslike << '\t' |
1540 | | // << "\n"; |
1541 | | // String sl; |
1542 | | //# endif |
1543 | 7.53k | StrHashSet duplicates_check; |
1544 | 7.53k | pair<StrHashSet::iterator, bool> dup_pair; |
1545 | 7.53k | for (NearMisses::const_iterator i = scored_near_misses.begin(); |
1546 | 729k | i != scored_near_misses.end() && res.size() < limit |
1547 | 729k | && ( i->adj_score < LARGE_NUM || res.size() < 3); |
1548 | 722k | ++i) { |
1549 | | # ifdef DEBUG_SUGGEST |
1550 | | //COUT.printf("%p %p: ", i->word, i->soundslike); |
1551 | | COUT << i->word |
1552 | | << '\t' << i->adj_score |
1553 | | << '\t' << i->score |
1554 | | << '\t' << i->word_score |
1555 | | << '\t' << i->soundslike |
1556 | | << '\t' << i->soundslike_score << "\n"; |
1557 | | # endif |
1558 | 722k | Working * src = i->src; |
1559 | 722k | if (i->repl_list != 0) { |
1560 | 0 | do { |
1561 | 0 | const char * word = i->src->fix_word(res.buf, i->repl_list->word); |
1562 | 0 | dup_pair = duplicates_check.insert(word); |
1563 | 0 | if (dup_pair.second) { |
1564 | 0 | const char * pos = strchr(word, ' '); |
1565 | 0 | bool in_dict; |
1566 | 0 | if (pos == NULL) |
1567 | 0 | in_dict = src->sp->check(word); |
1568 | 0 | else |
1569 | 0 | in_dict = src->sp->check(word, pos - word) && src->sp->check(pos + 1); |
1570 | 0 | if (in_dict) |
1571 | 0 | res.push_back(Suggestion(word,&*i)); |
1572 | 0 | } |
1573 | 0 | } while (i->repl_list->adv()); |
1574 | 722k | } else { |
1575 | 722k | char * word = src->have_presuf ? src->fix_word(res.buf, i->word) : src->fix_case(i->word); |
1576 | 722k | dup_pair = duplicates_check.insert(word); |
1577 | 722k | if (dup_pair.second) |
1578 | 498k | res.push_back(Suggestion(word,&*i)); |
1579 | 722k | } |
1580 | 722k | } |
1581 | 15.7k | for (Vector<Working *>::iterator i = srcs.begin(), e = srcs.end(); i != e; ++i) { |
1582 | 8.21k | res.saved_bufs_.push_back((*i)->buffer.freeze()); |
1583 | 8.21k | } |
1584 | 7.53k | res.saved_near_misses_.swap(scored_near_misses); |
1585 | 7.53k | } |
1586 | | |
1587 | | class SuggestionListImpl : public SuggestionList { |
1588 | | struct Parms { |
1589 | | typedef const char * Value; |
1590 | | typedef SuggestionsImpl::const_iterator Iterator; |
1591 | | Iterator end; |
1592 | 7.53k | Parms(Iterator e) : end(e) {} |
1593 | 505k | bool endf(Iterator e) const {return e == end;} |
1594 | 7.53k | Value end_state() const {return 0;} |
1595 | 498k | Value deref(Iterator i) const {return i->word;} |
1596 | | }; |
1597 | | public: |
1598 | | SuggestionsImpl suggestions; |
1599 | | |
1600 | | //SuggestionList * clone() const {return new SuggestionListImpl(*this);} |
1601 | | //void assign(const SuggestionList * other) { |
1602 | | // *this = *static_cast<const SuggestionListImpl *>(other); |
1603 | | //} |
1604 | | |
1605 | 0 | bool empty() const { return suggestions.empty(); } |
1606 | 0 | Size size() const { return suggestions.size(); } |
1607 | 7.53k | VirEmul * elements() const { |
1608 | 7.53k | return new MakeEnumeration<Parms, StringEnumeration> |
1609 | 7.53k | (suggestions.begin(), Parms(suggestions.end())); |
1610 | 7.53k | } |
1611 | | }; |
1612 | | |
1613 | | class SuggestImpl : public Suggest { |
1614 | | SpellerImpl * speller_; |
1615 | | SuggestionListImpl suggestion_list; |
1616 | | SuggestParms parms_; |
1617 | | public: |
1618 | 1.43k | SuggestImpl(SpellerImpl * sp) : speller_(sp) {} |
1619 | | PosibErr<void> setup(String mode = ""); |
1620 | 0 | PosibErr<void> set_mode(ParmString mode) { |
1621 | 0 | return setup(mode); |
1622 | 0 | } |
1623 | | SuggestionList & suggest(const char * word); |
1624 | | SuggestionsData & suggestions(const char * word); |
1625 | | }; |
1626 | | |
1627 | | PosibErr<void> SuggestImpl::setup(String mode) |
1628 | 1.43k | { |
1629 | 1.43k | if (mode == "") |
1630 | 1.43k | mode = speller_->config()->retrieve("sug-mode"); |
1631 | | |
1632 | 1.43k | RET_ON_ERR(parms_.init(mode, speller_, speller_->config())); |
1633 | | |
1634 | 1.42k | return no_err; |
1635 | 1.43k | } |
1636 | | |
1637 | 7.53k | SuggestionList & SuggestImpl::suggest(const char * word) { |
1638 | | # ifdef DEBUG_SUGGEST |
1639 | | COUT << "=========== begin suggest " << word << " ===========\n"; |
1640 | | # endif |
1641 | 7.53k | Working * sug = new Working(speller_, &speller_->lang(),word, &parms_); |
1642 | 7.53k | Sugs * sugs = sug->suggestions(); |
1643 | 7.53k | CheckInfo ci[8]; |
1644 | 7.53k | SpellerImpl::CompoundInfo cpi; |
1645 | 7.53k | String buf = word; |
1646 | 7.53k | char * str = buf.mstr(); |
1647 | 7.53k | speller_->check(str, str + buf.size(), false, speller_->run_together_limit(), ci, ci + 8, NULL, &cpi); |
1648 | 7.53k | if (cpi.count > 1 && cpi.incorrect_count == 1) { |
1649 | 684 | CheckInfo * ci = cpi.first_incorrect; |
1650 | 684 | String prefix(str, ci->word.str - str), middle(ci->word.str, ci->word.len), suffix(ci->word.str + ci->word.len); |
1651 | 684 | sug = new Working(speller_, &speller_->lang(), middle, &parms_); |
1652 | 684 | sug->camel_case = false; |
1653 | 684 | sug->with_presuf(prefix, suffix); |
1654 | 684 | Sugs * sugs2 = sug->suggestions(); |
1655 | 684 | sugs->merge(*sugs2); |
1656 | 684 | delete sugs2; |
1657 | 684 | } |
1658 | 7.53k | sugs->transfer(suggestion_list.suggestions, parms_.limit); |
1659 | 7.53k | delete sugs; |
1660 | | # ifdef DEBUG_SUGGEST |
1661 | | COUT << "^^^^^^^^^^^ end suggest " << word << " ^^^^^^^^^^^\n"; |
1662 | | # endif |
1663 | 7.53k | return suggestion_list; |
1664 | 7.53k | } |
1665 | | |
1666 | 0 | SuggestionsData & SuggestImpl::suggestions(const char * word) { |
1667 | 0 | suggest(word); |
1668 | 0 | return suggestion_list.suggestions; |
1669 | 0 | } |
1670 | | } |
1671 | | |
1672 | | namespace aspeller { |
1673 | 1.43k | PosibErr<Suggest *> new_default_suggest(SpellerImpl * m) { |
1674 | 1.43k | StackPtr<SuggestImpl> s(new SuggestImpl(m)); |
1675 | 1.43k | RET_ON_ERR(s->setup()); |
1676 | 1.42k | return s.release(); |
1677 | 1.43k | } |
1678 | | |
1679 | 1.43k | PosibErr<void> SuggestParms::init(ParmString mode, SpellerImpl * sp) { |
1680 | | |
1681 | 1.43k | edit_distance_weights.del1 = 95; |
1682 | 1.43k | edit_distance_weights.del2 = 95; |
1683 | 1.43k | edit_distance_weights.swap = 90; |
1684 | 1.43k | edit_distance_weights.sub = 100; |
1685 | 1.43k | edit_distance_weights.similar = 10; |
1686 | 1.43k | edit_distance_weights.max = 100; |
1687 | 1.43k | edit_distance_weights.min = 90; |
1688 | | |
1689 | 1.43k | soundslike_weight = 50; |
1690 | | |
1691 | 1.43k | split_chars = " -"; |
1692 | 1.43k | camel_case = false; |
1693 | | |
1694 | 1.43k | skip = 2; |
1695 | 1.43k | limit = 100; |
1696 | 1.43k | span = 50; |
1697 | 1.43k | ngram_keep = 10; |
1698 | 1.43k | use_typo_analysis = true; |
1699 | 1.43k | use_repl_table = sp->have_repl; |
1700 | 1.43k | try_one_edit_word = true; // always a good idea, even when |
1701 | | // soundslike lookup is used |
1702 | 1.43k | check_after_one_edit_word = false; |
1703 | 1.43k | try_scan_0 = false; |
1704 | 1.43k | try_scan_1 = false; |
1705 | 1.43k | try_scan_2 = false; |
1706 | 1.43k | try_ngram = false; |
1707 | 1.43k | ngram_threshold = 2; |
1708 | | |
1709 | 1.43k | if (mode == "ultra") { |
1710 | 4 | try_scan_0 = true; |
1711 | 1.42k | } else if (mode == "fast") { |
1712 | 0 | try_scan_1 = true; |
1713 | 1.42k | } else if (mode == "normal") { |
1714 | 1.40k | try_scan_1 = true; |
1715 | 1.40k | try_scan_2 = true; |
1716 | 1.40k | } else if (mode == "slow") { |
1717 | 12 | try_scan_2 = true; |
1718 | 12 | try_ngram = true; |
1719 | 12 | limit = 1000; |
1720 | 12 | ngram_threshold = sp->have_soundslike ? 1 : 2; |
1721 | 12 | } else if (mode == "bad-spellers") { |
1722 | 6 | try_scan_2 = true; |
1723 | 6 | try_ngram = true; |
1724 | 6 | use_typo_analysis = false; |
1725 | 6 | soundslike_weight = 55; |
1726 | 6 | span = 125; |
1727 | 6 | limit = 1000; |
1728 | 6 | ngram_threshold = 1; |
1729 | 6 | } else { |
1730 | 1 | return make_err(bad_value, "sug-mode", mode, _("one of ultra, fast, normal, slow, or bad-spellers")); |
1731 | 1 | } |
1732 | | |
1733 | 1.42k | if (!sp->have_soundslike) { |
1734 | | // in this case try_scan_0/1 will not get better results than |
1735 | | // try_one_edit_word |
1736 | 0 | if (try_scan_0 || try_scan_1) { |
1737 | 0 | check_after_one_edit_word = true; |
1738 | 0 | try_scan_0 = false; |
1739 | 0 | try_scan_1 = false; |
1740 | 0 | } |
1741 | 0 | } |
1742 | | |
1743 | 1.42k | word_weight = 100 - soundslike_weight; |
1744 | | |
1745 | 1.42k | return no_err; |
1746 | 1.43k | } |
1747 | | |
1748 | 1.43k | PosibErr<void> SuggestParms::init(ParmString mode, SpellerImpl * sp, Config * config) { |
1749 | 1.43k | RET_ON_ERR(init(mode,sp)); |
1750 | | |
1751 | 1.42k | if (config->have("sug-typo-analysis")) |
1752 | 0 | use_typo_analysis = config->retrieve_bool("sug-typo-analysis"); |
1753 | 1.42k | if (config->have("sug-repl-table")) |
1754 | 0 | use_repl_table = config->retrieve_bool("sug-repl-table"); |
1755 | | |
1756 | 1.42k | camel_case = config->retrieve_bool("camel-case"); |
1757 | 1.42k | if (camel_case) |
1758 | 47 | split_chars.clear(); |
1759 | | |
1760 | 1.42k | if (!camel_case || config->have("sug-split-char")) { |
1761 | 1.38k | StringList sl; |
1762 | 1.38k | config->retrieve_list("sug-split-char", &sl); |
1763 | 1.38k | StringListEnumeration els = sl.elements_obj(); |
1764 | 1.38k | const char * s; |
1765 | 1.38k | split_chars.clear(); |
1766 | 4.76k | while ((s = els.next()) != 0) { |
1767 | 3.38k | split_chars.push_back(*s); |
1768 | 3.38k | } |
1769 | 1.38k | } |
1770 | | |
1771 | 1.42k | if (use_typo_analysis) { |
1772 | 1.42k | String keyboard = config->retrieve("keyboard"); |
1773 | 1.42k | RET_ON_ERR(aspeller::setup(ti, config, &sp->lang(), keyboard)); |
1774 | 1.42k | } |
1775 | | |
1776 | 1.42k | return no_err; |
1777 | 1.42k | } |
1778 | | |
1779 | | } |
1780 | | |