/src/aspell/modules/speller/default/affix.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // This file is part of The New Aspell |
2 | | // Copyright (C) 2004 by Kevin Atkinson under the GNU LGPL |
3 | | // license version 2.0 or 2.1. You should have received a copy of the |
4 | | // LGPL license along with this library if you did not you can find it |
5 | | // at http://www.gnu.org/. |
6 | | // |
7 | | // This code is based on the the MySpell affix code: |
8 | | // |
9 | | /* |
10 | | * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada And |
11 | | * Contributors. All rights reserved. |
12 | | * |
13 | | * Redistribution and use in source and binary forms, with or without |
14 | | * modification, are permitted provided that the following conditions |
15 | | * are met: |
16 | | * |
17 | | * 1. Redistributions of source code must retain the above copyright |
18 | | * notice, this list of conditions and the following disclaimer. |
19 | | * |
20 | | * 2. Redistributions in binary form must reproduce the above copyright |
21 | | * notice, this list of conditions and the following disclaimer in the |
22 | | * documentation and/or other materials provided with the distribution. |
23 | | * |
24 | | * 3. All modifications to the source code must be clearly marked as |
25 | | * such. Binary redistributions based on modified source code |
26 | | * must be clearly marked as modified versions in the documentation |
27 | | * and/or other materials provided with the distribution. |
28 | | * |
29 | | * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS |
30 | | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
31 | | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
32 | | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL |
33 | | * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
34 | | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
35 | | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
36 | | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
37 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
38 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
39 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
40 | | * SUCH DAMAGE. |
41 | | * |
42 | | */ |
43 | | |
44 | | #include <cstdlib> |
45 | | #include <cstring> |
46 | | #include <cstdio> |
47 | | |
48 | | //#include "iostream.hpp" |
49 | | |
50 | | #include "affix.hpp" |
51 | | #include "errors.hpp" |
52 | | #include "getdata.hpp" |
53 | | #include "parm_string.hpp" |
54 | | #include "check_list.hpp" |
55 | | #include "speller_impl.hpp" |
56 | | #include "vararray.hpp" |
57 | | #include "lsort.hpp" |
58 | | #include "hash-t.hpp" |
59 | | |
60 | | #include "gettext.h" |
61 | | |
62 | | using namespace std; |
63 | | |
64 | | namespace aspeller { |
65 | | |
66 | | typedef unsigned char byte; |
67 | | static char EMPTY[1] = {0}; |
68 | | |
69 | | ////////////////////////////////////////////////////////////////////// |
70 | | // |
71 | | // Entry struct definations |
72 | | // |
73 | | |
74 | | struct Conds |
75 | | { |
76 | | char * str; |
77 | | unsigned num; |
78 | | char conds[SETSIZE]; |
79 | 120M | char get(byte i) const {return conds[i];} |
80 | | }; |
81 | | |
82 | | struct AffEntry |
83 | | { |
84 | | const char * appnd; |
85 | | const char * strip; |
86 | | byte appndl; |
87 | | byte stripl; |
88 | | byte xpflg; |
89 | | char achar; |
90 | | const Conds * conds; |
91 | | //unsigned int numconds; |
92 | | //char conds[SETSIZE]; |
93 | | }; |
94 | | |
95 | | // A Prefix Entry |
96 | | |
97 | | struct PfxEntry : public AffEntry |
98 | | { |
99 | | PfxEntry * next; |
100 | | PfxEntry * next_eq; |
101 | | PfxEntry * next_ne; |
102 | | PfxEntry * flag_next; |
103 | 10.9k | PfxEntry() {} |
104 | | |
105 | | bool check(const LookupInfo &, const AffixMgr * pmyMgr, |
106 | | ParmString, CheckInfo &, GuessInfo *, bool cross = true) const; |
107 | | |
108 | 122k | inline bool allow_cross() const { return ((xpflg & XPRODUCT) != 0); } |
109 | 10.9k | inline byte flag() const { return achar; } |
110 | 1.56M | inline const char * key() const { return appnd; } |
111 | | bool applicable(SimpleString) const; |
112 | | SimpleString add(SimpleString, ObjStack & buf) const; |
113 | | }; |
114 | | |
115 | | // A Suffix Entry |
116 | | |
117 | | struct SfxEntry : public AffEntry |
118 | | { |
119 | | const char * rappnd; // this is set in AffixMgr::build_sfxlist |
120 | | |
121 | | SfxEntry * next; |
122 | | SfxEntry * next_eq; |
123 | | SfxEntry * next_ne; |
124 | | SfxEntry * flag_next; |
125 | | |
126 | 917k | SfxEntry() {} |
127 | | |
128 | | bool check(const LookupInfo &, ParmString, CheckInfo &, GuessInfo *, |
129 | | int optflags, AffEntry * ppfx); |
130 | | |
131 | 644k | inline bool allow_cross() const { return ((xpflg & XPRODUCT) != 0); } |
132 | 917k | inline byte flag() const { return achar; } |
133 | 649M | inline const char * key() const { return rappnd; } |
134 | | bool applicable(SimpleString) const; |
135 | | SimpleString add(SimpleString, ObjStack & buf, int limit, SimpleString) const; |
136 | | }; |
137 | | |
138 | | ////////////////////////////////////////////////////////////////////// |
139 | | // |
140 | | // Utility functions declarations |
141 | | // |
142 | | |
143 | | /* return 1 if s1 is subset of s2 */ |
144 | | static bool isSubset(const char * s1, const char * s2) |
145 | 311M | { |
146 | 1.30G | while( *s1 && (*s1 == *s2) ) { |
147 | 994M | s1++; |
148 | 994M | s2++; |
149 | 994M | } |
150 | 311M | return (*s1 == '\0'); |
151 | 311M | } |
152 | | |
153 | | // return 1 if s1 (reversed) is a leading subset of end of s2 |
154 | | static bool isRevSubset(const char * s1, const char * end_of_s2, int len) |
155 | 9.22M | { |
156 | 20.6M | while( (len > 0) && *s1 && (*s1 == *end_of_s2) ) { |
157 | 11.4M | s1++; |
158 | 11.4M | end_of_s2--; |
159 | 11.4M | len --; |
160 | 11.4M | } |
161 | 9.22M | return (*s1 == '\0'); |
162 | 9.22M | } |
163 | | |
164 | | template <class T> |
165 | | struct AffixLess |
166 | | { |
167 | 9.82M | bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;} aspeller::AffixLess<aspeller::PfxEntry>::operator()(aspeller::PfxEntry*, aspeller::PfxEntry*) const Line | Count | Source | 167 | 9.80k | bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;} |
aspeller::AffixLess<aspeller::SfxEntry>::operator()(aspeller::SfxEntry*, aspeller::SfxEntry*) const Line | Count | Source | 167 | 9.81M | bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;} |
|
168 | | }; |
169 | | |
170 | | // struct StringLookup { |
171 | | // struct Parms { |
172 | | // typedef const char * Value; |
173 | | // typedef const char * Key; |
174 | | // static const bool is_multi = false; |
175 | | // hash<const char *> hfun; |
176 | | // size_t hash(const char * s) {return hfun(s);} |
177 | | // bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;} |
178 | | // const char * key(const char * c) {return c;} |
179 | | // }; |
180 | | // typedef HashTable<Parms> Lookup; |
181 | | // Lookup lookup; |
182 | | // ObjStack * data_buf; |
183 | | // StringLookup(ObjStack * b) : data_buf(b) {} |
184 | | // const char * dup(const char * orig) { |
185 | | // pair<Lookup::iterator, bool> res = lookup.insert(orig); |
186 | | // if (res.second) *res.first = data_buf->dup(orig); |
187 | | // return *res.first; |
188 | | // //return data_buf->dup(orig); |
189 | | // } |
190 | | // }; |
191 | | |
192 | | struct CondsLookupParms { |
193 | | typedef const Conds * Value; |
194 | | typedef const char * Key; |
195 | | static const bool is_multi = false; |
196 | | acommon::hash<const char *> hfun; |
197 | 963k | size_t hash(const char * s) {return hfun(s);} |
198 | 1.42M | bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;} |
199 | 1.46M | const char * key(const Conds * c) {return c->str;} |
200 | | }; |
201 | | |
202 | | typedef HashTable<CondsLookupParms> CondsLookup; |
203 | | |
204 | | // normalizes and checks the cond_str |
205 | | // returns the length of the new string or -1 if invalid |
206 | | static int normalize_cond_str(char * str) |
207 | 927k | { |
208 | 927k | char * s = str; |
209 | 927k | char * d = str; |
210 | 4.73M | while (*s) { |
211 | 3.81M | if (*s != '[') { |
212 | 3.69M | *d++ = *s++; |
213 | 3.69M | } else if (s[1] == '\0' || s[1] == ']') { |
214 | 0 | return -1; |
215 | 118k | } else if (s[2] == ']') { |
216 | 0 | *d++ = s[1]; |
217 | 0 | s += 3; |
218 | 118k | } else { |
219 | 118k | *d++ = *s++; |
220 | 118k | if (*s == '^') *d++ = *s++; |
221 | 451k | while (*s != ']') { |
222 | 333k | if (*s == '\0' || *s == '[') return -1; |
223 | 333k | char * min = s; |
224 | 725k | for (char * i = s + 1; *i != ']'; ++i) { |
225 | 392k | if ((byte)*i < (byte)*min) min = i;} |
226 | 333k | char c = *s; |
227 | 333k | *d++ = *min; |
228 | 333k | *min = c; |
229 | 333k | ++s; |
230 | 333k | } |
231 | 118k | *d++ = *s++; |
232 | 118k | } |
233 | 3.81M | } |
234 | 927k | *d = '\0'; |
235 | 927k | return d - str; |
236 | 927k | } |
237 | | |
238 | | static void encodeit(CondsLookup &, ObjStack &, |
239 | | AffEntry * ptr, char * cs); |
240 | | |
241 | | ////////////////////////////////////////////////////////////////////// |
242 | | // |
243 | | // Affix Manager |
244 | | // |
245 | | |
246 | | PosibErr<void> AffixMgr::setup(ParmString affpath, Conv & iconv) |
247 | 1.02k | { |
248 | | // register hash manager and load affix data from aff file |
249 | | //cpdmin = 3; // default value |
250 | 1.02k | max_strip_ = 0; |
251 | 264k | for (int i=0; i < SETSIZE; i++) { |
252 | 263k | pStart[i] = NULL; |
253 | 263k | sStart[i] = NULL; |
254 | 263k | pFlag[i] = NULL; |
255 | 263k | sFlag[i] = NULL; |
256 | 263k | max_strip_f[i] = 0; |
257 | 263k | } |
258 | 1.02k | return parse_file(affpath, iconv); |
259 | 1.02k | } |
260 | | |
261 | | AffixMgr::AffixMgr(const Language * l) |
262 | 1.02k | : lang(l), data_buf(1024*16) {} |
263 | | |
264 | 1.02k | AffixMgr::~AffixMgr() {} |
265 | | |
266 | | static inline void max_(int & lhs, int rhs) |
267 | 1.25M | { |
268 | 1.25M | if (lhs < rhs) lhs = rhs; |
269 | 1.25M | } |
270 | | |
271 | | // read in aff file and build up prefix and suffix entry objects |
272 | | PosibErr<void> AffixMgr::parse_file(const char * affpath, Conv & iconv) |
273 | 1.02k | { |
274 | | // io buffers |
275 | 1.02k | String buf; DataPair dp; |
276 | | |
277 | 1.02k | CondsLookup conds_lookup; |
278 | | |
279 | | // open the affix file |
280 | 1.02k | affix_file = data_buf.dup(affpath); |
281 | 1.02k | FStream afflst; |
282 | 1.02k | RET_ON_ERR(afflst.open(affpath,"r")); |
283 | | |
284 | | // step one is to parse the affix file building up the internal |
285 | | // affix data structures |
286 | | |
287 | | // read in each line ignoring any that do not |
288 | | // start with a known line type indicator |
289 | | |
290 | 1.02k | char prev_aff = '\0'; |
291 | | |
292 | 156k | while (getdata_pair(afflst,dp,buf)) { |
293 | 155k | char affix_type = ' '; |
294 | | |
295 | | /* parse in the name of the character set used by the .dict and .aff */ |
296 | | |
297 | 155k | if (dp.key == "SET") { |
298 | 1.02k | String buf; |
299 | 1.02k | encoding = data_buf.dup(fix_encoding_str(dp.value, buf)); |
300 | 1.02k | if (strcmp(encoding, lang->data_encoding()) != 0) |
301 | 0 | return make_err(incorrect_encoding, affix_file, lang->data_encoding(), encoding); |
302 | 1.02k | } |
303 | | |
304 | | /* parse in the flag used by the controlled compound words */ |
305 | | //else if (d.key == "COMPOUNDFLAG") |
306 | | // compound = data_buf.dup(d.value); |
307 | | |
308 | | /* parse in the flag used by the controlled compound words */ |
309 | | //else if (d.key == "COMPOUNDMIN") |
310 | | // cpdmin = atoi(d.value); // FiXME |
311 | | |
312 | | //else if (dp.key == "TRY" || dp.key == "REP"); |
313 | | |
314 | 154k | else if (dp.key == "PFX" || dp.key == "SFX") |
315 | 26.3k | affix_type = dp.key[0]; |
316 | | |
317 | 155k | if (affix_type == ' ') continue; |
318 | | |
319 | | // |
320 | | // parse this affix: P - prefix, S - suffix |
321 | | // |
322 | | |
323 | 26.3k | int numents = 0; // number of affentry structures to parse |
324 | 26.3k | char achar='\0'; // affix char identifier |
325 | 26.3k | short xpflg=0; |
326 | 26.3k | AffEntry * nptr; |
327 | 26.3k | { |
328 | | // split affix header line into pieces |
329 | 26.3k | split(dp); |
330 | 26.3k | if (dp.key.empty()) goto error; |
331 | | // key is affix char |
332 | 26.3k | const char * astr = iconv(dp.key); |
333 | 26.3k | if (astr[0] == '\0' || astr[1] != '\0') goto error; |
334 | 26.3k | achar = astr[0]; |
335 | 26.3k | if (achar == prev_aff) goto error_count; |
336 | 26.3k | prev_aff = achar; |
337 | | |
338 | 26.3k | split(dp); |
339 | 26.3k | if (dp.key.size != 1 || |
340 | 26.3k | !(dp.key[0] == 'Y' || dp.key[0] == 'N')) goto error; |
341 | | // key is cross product indicator |
342 | 26.3k | if (dp.key[0] == 'Y') xpflg = XPRODUCT; |
343 | | |
344 | 26.3k | split(dp); |
345 | 26.3k | if (dp.key.empty()) goto error; |
346 | | // key is number of affentries |
347 | | |
348 | 26.3k | numents = atoi(dp.key); |
349 | | |
350 | 954k | for (int j = 0; j < numents; j++) { |
351 | 927k | getdata_pair(afflst, dp, buf); |
352 | | |
353 | 927k | if (affix_type == 'P') { |
354 | 10.9k | nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(PfxEntry)); |
355 | 10.9k | new (nptr) PfxEntry; |
356 | 917k | } else { |
357 | 917k | nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(SfxEntry)); |
358 | 917k | new (nptr) SfxEntry; |
359 | 917k | } |
360 | | |
361 | 927k | nptr->xpflg = xpflg; |
362 | | |
363 | 927k | split(dp); |
364 | 927k | if (dp.key.empty()) goto error; |
365 | | // key is affix charter |
366 | 927k | if (iconv(dp.key)[0] != achar) goto error_count; |
367 | 927k | nptr->achar = achar; |
368 | | |
369 | 927k | split(dp); |
370 | 927k | if (dp.key.empty()) goto error; |
371 | | // key is strip |
372 | 927k | if (dp.key != "0") { |
373 | 629k | ParmString s0(iconv(dp.key)); |
374 | 629k | max_(max_strip_, s0.size()); |
375 | 629k | max_(max_strip_f[(byte)achar], s0.size()); |
376 | 629k | nptr->strip = data_buf.dup(s0); |
377 | 629k | nptr->stripl = s0.size(); |
378 | 629k | } else { |
379 | 298k | nptr->strip = ""; |
380 | 298k | nptr->stripl = 0; |
381 | 298k | } |
382 | | |
383 | 927k | split(dp); |
384 | 927k | if (dp.key.empty()) goto error; |
385 | | // key is affix string or 0 for null |
386 | 927k | if (dp.key != "0") { |
387 | 925k | nptr->appnd = data_buf.dup(iconv(dp.key)); |
388 | 925k | nptr->appndl = strlen(nptr->appnd); |
389 | 925k | } else { |
390 | 1.97k | nptr->appnd = ""; |
391 | 1.97k | nptr->appndl = 0; |
392 | 1.97k | } |
393 | | |
394 | 927k | split(dp); |
395 | 927k | if (dp.key.empty()) goto error; |
396 | | // key is the conditions descriptions |
397 | 927k | char * cond = iconv(dp.key); |
398 | 927k | int cond_len = normalize_cond_str(cond); |
399 | 927k | if (cond_len < 0) |
400 | 0 | return (make_err(invalid_cond, MsgConv(lang)(cond)) |
401 | 0 | .with_file(affix_file, dp.line_num)); |
402 | 927k | if (nptr->stripl != 0) { |
403 | 629k | char * cc = cond; |
404 | 629k | if (affix_type == 'S') cc += cond_len - nptr->stripl; |
405 | 629k | if (cond_len < nptr->stripl || |
406 | 629k | memcmp(cc, nptr->strip, nptr->stripl) != 0) |
407 | 0 | return (make_err(invalid_cond_strip, |
408 | 0 | MsgConv(lang)(cond), MsgConv(lang)(nptr->strip)) |
409 | 0 | .with_file(affix_file, dp.line_num)); |
410 | 629k | } |
411 | 927k | encodeit(conds_lookup, data_buf, nptr, cond); |
412 | | |
413 | | // now create SfxEntry or PfxEntry objects and use links to |
414 | | // build an ordered (sorted by affix string) list |
415 | 927k | if (affix_type == 'P') |
416 | 10.9k | build_pfxlist(static_cast<PfxEntry *>(nptr)); |
417 | 917k | else |
418 | 917k | build_sfxlist(static_cast<SfxEntry *>(nptr)); |
419 | 927k | } |
420 | 26.3k | } |
421 | 26.3k | continue; |
422 | 26.3k | error: |
423 | 0 | return make_err(corrupt_affix, MsgConv(lang)(achar)).with_file(affix_file, dp.line_num); |
424 | 0 | error_count: |
425 | 0 | return make_err(corrupt_affix, MsgConv(lang)(achar), |
426 | 0 | _("Possibly incorrect count.")).with_file(affix_file, dp.line_num); |
427 | 26.3k | } |
428 | 1.02k | afflst.close(); |
429 | | |
430 | | // now we can speed up performance greatly taking advantage of the |
431 | | // relationship between the affixes and the idea of "subsets". |
432 | | |
433 | | // View each prefix as a potential leading subset of another and view |
434 | | // each suffix (reversed) as a potential trailing subset of another. |
435 | | |
436 | | // To illustrate this relationship if we know the prefix "ab" is |
437 | | // found in the word to examine, only prefixes that "ab" is a |
438 | | // leading subset of need be examined. Furthermore is "ab" is not |
439 | | // present then none of the prefixes that "ab" is is a subset need |
440 | | // be examined. |
441 | | |
442 | | // The same argument goes for suffix string that are reversed. |
443 | | |
444 | | // Then to top this off why not examine the first char of the word |
445 | | // to quickly limit the set of prefixes to examine (i.e. the |
446 | | // prefixes to examine must be leading supersets of the first |
447 | | // character of the word (if they exist) |
448 | | |
449 | | // To take advantage of this "subset" relationship, we need to add |
450 | | // two links from entry. One to take next if the current prefix |
451 | | // is found (call it nexteq) and one to take next if the current |
452 | | // prefix is not found (call it nextne). |
453 | | |
454 | | // Since we have built ordered lists, all that remains is to |
455 | | // properly initialize the nextne and nexteq pointers that relate |
456 | | // them |
457 | | |
458 | 1.02k | process_pfx_order(); |
459 | 1.02k | process_sfx_order(); |
460 | | |
461 | | //CERR.printf("%u\n", data_buf.calc_size()/1024); |
462 | | |
463 | 1.02k | return no_err; |
464 | | |
465 | 1.02k | } |
466 | | |
467 | | |
468 | | // we want to be able to quickly access prefix information |
469 | | // both by prefix flag, and sorted by prefix string itself |
470 | | // so we need to set up two indexes |
471 | | |
472 | | PosibErr<void> AffixMgr::build_pfxlist(PfxEntry* pfxptr) |
473 | 10.9k | { |
474 | 10.9k | PfxEntry * ptr; |
475 | 10.9k | PfxEntry * ep = pfxptr; |
476 | | |
477 | | // get the right starting point |
478 | 10.9k | const char * key = ep->key(); |
479 | 10.9k | const byte flg = ep->flag(); |
480 | | |
481 | | // first index by flag which must exist |
482 | 10.9k | ptr = pFlag[flg]; |
483 | 10.9k | ep->flag_next = ptr; |
484 | 10.9k | pFlag[flg] = ep; |
485 | | |
486 | | // next insert the affix string, it will be sorted latter |
487 | | |
488 | 10.9k | byte sp = *((const byte *)key); |
489 | 10.9k | ptr = pStart[sp]; |
490 | 10.9k | ep->next = ptr; |
491 | 10.9k | pStart[sp] = ep; |
492 | 10.9k | return no_err; |
493 | 10.9k | } |
494 | | |
495 | | // we want to be able to quickly access suffix information |
496 | | // both by suffix flag, and sorted by the reverse of the |
497 | | // suffix string itself; so we need to set up two indexes |
498 | | |
499 | | PosibErr<void> AffixMgr::build_sfxlist(SfxEntry* sfxptr) |
500 | 917k | { |
501 | 917k | SfxEntry * ptr; |
502 | 917k | SfxEntry * ep = sfxptr; |
503 | 917k | char * tmp = (char *)data_buf.alloc(sfxptr->appndl + 1); |
504 | 917k | sfxptr->rappnd = tmp; |
505 | | |
506 | | // reverse the string |
507 | 917k | char * dest = tmp + sfxptr->appndl; |
508 | 917k | *dest-- = 0; |
509 | 917k | const char * src = sfxptr->appnd; |
510 | 6.24M | for (; dest >= tmp; --dest, ++src) |
511 | 5.33M | *dest = *src; |
512 | | |
513 | | /* get the right starting point */ |
514 | 917k | const char * key = ep->key(); |
515 | 917k | const byte flg = ep->flag(); |
516 | | |
517 | | // first index by flag which must exist |
518 | 917k | ptr = sFlag[flg]; |
519 | 917k | ep->flag_next = ptr; |
520 | 917k | sFlag[flg] = ep; |
521 | | |
522 | | // next insert the affix string, it will be sorted latter |
523 | | |
524 | 917k | byte sp = *((const byte *)key); |
525 | 917k | ptr = sStart[sp]; |
526 | 917k | ep->next = ptr; |
527 | 917k | sStart[sp] = ep; |
528 | 917k | return no_err; |
529 | 917k | } |
530 | | |
531 | | |
532 | | |
533 | | // initialize the PfxEntry links NextEQ and NextNE to speed searching |
534 | | PosibErr<void> AffixMgr::process_pfx_order() |
535 | 1.02k | { |
536 | 1.02k | PfxEntry* ptr; |
537 | | |
538 | | // loop through each prefix list starting point |
539 | 263k | for (int i=1; i < SETSIZE; i++) { |
540 | | |
541 | 262k | ptr = pStart[i]; |
542 | | |
543 | 262k | if (ptr && ptr->next) |
544 | 1.40k | ptr = pStart[i] = sort(ptr, AffixLess<PfxEntry>()); |
545 | | |
546 | | // look through the remainder of the list |
547 | | // and find next entry with affix that |
548 | | // the current one is not a subset of |
549 | | // mark that as destination for NextNE |
550 | | // use next in list that you are a subset |
551 | | // of as NextEQ |
552 | | |
553 | 273k | for (; ptr != NULL; ptr = ptr->next) { |
554 | | |
555 | 10.9k | PfxEntry * nptr = ptr->next; |
556 | 15.2k | for (; nptr != NULL; nptr = nptr->next) { |
557 | 8.30k | if (! isSubset( ptr->key() , nptr->key() )) break; |
558 | 8.30k | } |
559 | 10.9k | ptr->next_ne = nptr; |
560 | 10.9k | ptr->next_eq = NULL; |
561 | 10.9k | if ((ptr->next) && isSubset(ptr->key() , |
562 | 4.42k | (ptr->next)->key())) |
563 | 1.05k | ptr->next_eq = ptr->next; |
564 | 10.9k | } |
565 | | |
566 | | // now clean up by adding smart search termination strings |
567 | | // if you are already a superset of the previous prefix |
568 | | // but not a subset of the next, search can end here |
569 | | // so set NextNE properly |
570 | | |
571 | 262k | ptr = pStart[i]; |
572 | 273k | for (; ptr != NULL; ptr = ptr->next) { |
573 | 10.9k | PfxEntry * nptr = ptr->next; |
574 | 10.9k | PfxEntry * mptr = NULL; |
575 | 15.2k | for (; nptr != NULL; nptr = nptr->next) { |
576 | 8.30k | if (! isSubset(ptr->key(),nptr->key())) break; |
577 | 4.31k | mptr = nptr; |
578 | 4.31k | } |
579 | 10.9k | if (mptr) mptr->next_ne = NULL; |
580 | 10.9k | } |
581 | 262k | } |
582 | 1.02k | return no_err; |
583 | 1.02k | } |
584 | | |
585 | | |
586 | | |
587 | | // initialize the SfxEntry links NextEQ and NextNE to speed searching |
588 | | PosibErr<void> AffixMgr::process_sfx_order() |
589 | 1.02k | { |
590 | 1.02k | SfxEntry* ptr; |
591 | | |
592 | | // loop through each prefix list starting point |
593 | 263k | for (int i=1; i < SETSIZE; i++) { |
594 | | |
595 | 262k | ptr = sStart[i]; |
596 | | |
597 | 262k | if (ptr && ptr->next) |
598 | 8.87k | ptr = sStart[i] = sort(ptr, AffixLess<SfxEntry>()); |
599 | | |
600 | | // look through the remainder of the list |
601 | | // and find next entry with affix that |
602 | | // the current one is not a subset of |
603 | | // mark that as destination for NextNE |
604 | | // use next in list that you are a subset |
605 | | // of as NextEQ |
606 | | |
607 | 1.17M | for (; ptr != NULL; ptr = ptr->next) { |
608 | 915k | SfxEntry * nptr = ptr->next; |
609 | 154M | for (; nptr != NULL; nptr = nptr->next) { |
610 | 154M | if (! isSubset(ptr->key(),nptr->key())) break; |
611 | 154M | } |
612 | 915k | ptr->next_ne = nptr; |
613 | 915k | ptr->next_eq = NULL; |
614 | 915k | if ((ptr->next) && isSubset(ptr->key(),(ptr->next)->key())) |
615 | 745k | ptr->next_eq = ptr->next; |
616 | 915k | } |
617 | | |
618 | | |
619 | | // now clean up by adding smart search termination strings: |
620 | | // if you are already a superset of the previous suffix |
621 | | // but not a subset of the next, search can end here |
622 | | // so set NextNE properly |
623 | | |
624 | 262k | ptr = sStart[i]; |
625 | 1.17M | for (; ptr != NULL; ptr = ptr->next) { |
626 | 915k | SfxEntry * nptr = ptr->next; |
627 | 915k | SfxEntry * mptr = NULL; |
628 | 154M | for (; nptr != NULL; nptr = nptr->next) { |
629 | 154M | if (! isSubset(ptr->key(),nptr->key())) break; |
630 | 153M | mptr = nptr; |
631 | 153M | } |
632 | 915k | if (mptr) mptr->next_ne = NULL; |
633 | 915k | } |
634 | 262k | } |
635 | 1.02k | return no_err; |
636 | 1.02k | } |
637 | | |
638 | | // takes aff file condition string and creates the |
639 | | // conds array - please see the appendix at the end of the |
640 | | // file affentry.cxx which describes what is going on here |
641 | | // in much more detail |
642 | | |
643 | | static void encodeit(CondsLookup & l, ObjStack & buf, |
644 | | AffEntry * ptr, char * cs) |
645 | 927k | { |
646 | 927k | byte c; |
647 | 927k | int i, j, k; |
648 | | |
649 | | // see if we already have this conds matrix |
650 | | |
651 | 927k | CondsLookup::iterator itr = l.find(cs); |
652 | 927k | if (!(itr == l.end())) { |
653 | 904k | ptr->conds = *itr; |
654 | 904k | return; |
655 | 904k | } |
656 | | |
657 | 23.4k | Conds * cds = (Conds *)buf.alloc_bottom(sizeof(Conds)); |
658 | 23.4k | cds->str = buf.dup(cs); |
659 | 23.4k | l.insert(cds); |
660 | 23.4k | ptr->conds = cds; |
661 | | |
662 | 23.4k | int nc = strlen(cs); |
663 | 23.4k | VARARRAYM(byte, mbr, nc + 1, MAXLNLEN); |
664 | | |
665 | | // now clear the conditions array |
666 | 23.4k | memset(cds->conds, 0, sizeof(cds->conds)); |
667 | | |
668 | | // now parse the string to create the conds array |
669 | | |
670 | 23.4k | int neg = 0; // complement indicator |
671 | 23.4k | int grp = 0; // group indicator |
672 | 23.4k | int n = 0; // number of conditions |
673 | 23.4k | int ec = 0; // end condition indicator |
674 | 23.4k | int nm = 0; // number of member in group |
675 | | |
676 | | // if no condition just return |
677 | 23.4k | if (strcmp(cs,".")==0) { |
678 | 1.02k | cds->num = 0; |
679 | 1.02k | return; |
680 | 1.02k | } |
681 | | |
682 | 22.4k | i = 0; |
683 | 135k | while (i < nc) { |
684 | 112k | c = *((byte *)(cs + i)); |
685 | | |
686 | | // start group indicator |
687 | 112k | if (c == '[') { |
688 | 11.5k | grp = 1; |
689 | 11.5k | c = 0; |
690 | 11.5k | } |
691 | | |
692 | | // complement flag |
693 | 112k | if ((grp == 1) && (c == '^')) { |
694 | 8.25k | neg = 1; |
695 | 8.25k | c = 0; |
696 | 8.25k | } |
697 | | |
698 | | // end goup indicator |
699 | 112k | if (c == ']') { |
700 | 11.5k | ec = 1; |
701 | 11.5k | c = 0; |
702 | 11.5k | } |
703 | | |
704 | | // add character of group to list |
705 | 112k | if ((grp == 1) && (c != 0)) { |
706 | 40.2k | *(mbr + nm) = c; |
707 | 40.2k | nm++; |
708 | 40.2k | c = 0; |
709 | 40.2k | } |
710 | | |
711 | | // end of condition |
712 | 112k | if (c != 0) { |
713 | 41.4k | ec = 1; |
714 | 41.4k | } |
715 | | |
716 | | |
717 | 112k | if (ec) { |
718 | 52.9k | if (grp == 1) { |
719 | 11.5k | if (neg == 0) { |
720 | | // set the proper bits in the condition array vals for those chars |
721 | 16.7k | for (j=0;j<nm;j++) { |
722 | 13.4k | k = (unsigned int) mbr[j]; |
723 | 13.4k | cds->conds[k] = cds->conds[k] | (1 << n); |
724 | 13.4k | } |
725 | 8.25k | } else { |
726 | | // complement so set all of them and then unset indicated ones |
727 | 2.12M | for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n); |
728 | 35.0k | for (j=0;j<nm;j++) { |
729 | 26.7k | k = (unsigned int) mbr[j]; |
730 | 26.7k | cds->conds[k] = cds->conds[k] & ~(1 << n); |
731 | 26.7k | } |
732 | 8.25k | } |
733 | 11.5k | neg = 0; |
734 | 11.5k | grp = 0; |
735 | 11.5k | nm = 0; |
736 | 41.4k | } else { |
737 | | // not a group so just set the proper bit for this char |
738 | | // but first handle special case of . inside condition |
739 | 41.4k | if (c == '.') { |
740 | | // wild card character so set them all |
741 | 0 | for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n); |
742 | 41.4k | } else { |
743 | 41.4k | cds->conds[(unsigned int)c] = cds->conds[(unsigned int)c] | (1 << n); |
744 | 41.4k | } |
745 | 41.4k | } |
746 | 52.9k | n++; |
747 | 52.9k | ec = 0; |
748 | 52.9k | } |
749 | | |
750 | | |
751 | 112k | i++; |
752 | 112k | } |
753 | 22.4k | cds->num = n; |
754 | 22.4k | return; |
755 | 23.4k | } |
756 | | |
757 | | |
758 | | // check word for prefixes |
759 | | bool AffixMgr::prefix_check (const LookupInfo & linf, ParmString word, |
760 | | CheckInfo & ci, GuessInfo * gi, bool cross) const |
761 | 742k | { |
762 | 742k | if (word.empty()) return false; |
763 | | |
764 | | // first handle the special case of 0 length prefixes |
765 | 742k | PfxEntry * pe = pStart[0]; |
766 | 742k | while (pe) { |
767 | 0 | if (pe->check(linf,this,word,ci,gi)) return true; |
768 | 0 | pe = pe->next; |
769 | 0 | } |
770 | | |
771 | | // now handle the general case |
772 | 742k | byte sp = *reinterpret_cast<const byte *>(word.str()); |
773 | 742k | PfxEntry * pptr = pStart[sp]; |
774 | | |
775 | 2.23M | while (pptr) { |
776 | 1.49M | if (isSubset(pptr->key(),word)) { |
777 | 155k | if (pptr->check(linf,this,word,ci,gi,cross)) return true; |
778 | 155k | pptr = pptr->next_eq; |
779 | 1.33M | } else { |
780 | 1.33M | pptr = pptr->next_ne; |
781 | 1.33M | } |
782 | 1.49M | } |
783 | | |
784 | 742k | return false; |
785 | 742k | } |
786 | | |
787 | | |
788 | | // check word for suffixes |
789 | | bool AffixMgr::suffix_check (const LookupInfo & linf, ParmString word, |
790 | | CheckInfo & ci, GuessInfo * gi, |
791 | | int sfxopts, AffEntry * ppfx) const |
792 | 927k | { |
793 | 927k | if (word.empty()) return false; |
794 | | |
795 | | // first handle the special case of 0 length suffixes |
796 | 927k | SfxEntry * se = sStart[0]; |
797 | 43.1M | while (se) { |
798 | 42.2M | if (se->check(linf, word, ci, gi, sfxopts, ppfx)) return true; |
799 | 42.2M | se = se->next; |
800 | 42.2M | } |
801 | | |
802 | | // now handle the general case |
803 | 927k | byte sp = *((const byte *)(word + word.size() - 1)); |
804 | 927k | SfxEntry * sptr = sStart[sp]; |
805 | | |
806 | 10.1M | while (sptr) { |
807 | 9.22M | if (isRevSubset(sptr->key(), word + word.size() - 1, word.size())) { |
808 | 6.56M | if (sptr->check(linf, word, ci, gi, sfxopts, ppfx)) return true; |
809 | 6.56M | sptr = sptr->next_eq; |
810 | 6.56M | } else { |
811 | 2.66M | sptr = sptr->next_ne; |
812 | 2.66M | } |
813 | 9.22M | } |
814 | | |
815 | 926k | return false; |
816 | 927k | } |
817 | | |
818 | | // check if word with affixes is correctly spelled |
819 | | bool AffixMgr::affix_check(const LookupInfo & linf, ParmString word, |
820 | | CheckInfo & ci, GuessInfo * gi) const |
821 | 742k | { |
822 | 742k | if (word.empty()) return false; |
823 | | |
824 | | // Deal With Case in a semi-intelligent manner |
825 | 741k | CasePattern cp = lang->LangImpl::case_pattern(word); |
826 | 741k | ParmString pword = word; |
827 | 741k | ParmString sword = word; |
828 | 741k | CharVector lower; |
829 | 741k | if (cp == FirstUpper) { |
830 | 36.9k | lower.append(word, word.size() + 1); |
831 | 36.9k | lower[0] = lang->to_lower(word[0]); |
832 | 36.9k | pword = ParmString(lower.data(), lower.size() - 1); |
833 | 704k | } else if (cp == AllUpper) { |
834 | 55.8k | lower.resize(word.size() + 1); |
835 | 55.8k | unsigned int i = 0; |
836 | 2.00M | for (; i != word.size(); ++i) |
837 | 1.94M | lower[i] = lang->to_lower(word[i]); |
838 | 55.8k | lower[i] = '\0'; |
839 | 55.8k | pword = ParmString(lower.data(), lower.size() - 1); |
840 | 55.8k | sword = pword; |
841 | 55.8k | } |
842 | | |
843 | | // check all prefixes (also crossed with suffixes if allowed) |
844 | 741k | if (prefix_check(linf, pword, ci, gi)) return true; |
845 | | |
846 | | // if still not found check all suffixes |
847 | 741k | if (suffix_check(linf, sword, ci, gi, 0, NULL)) return true; |
848 | | |
849 | | // if still not found check again but with the lower case version |
850 | | // which can make a difference if the entire word matches the cond |
851 | | // string |
852 | 740k | if (cp == FirstUpper) { |
853 | 36.9k | return suffix_check(linf, pword, ci, gi, 0, NULL); |
854 | 703k | } else { |
855 | 703k | return false; |
856 | 703k | } |
857 | 740k | } |
858 | | |
859 | | void AffixMgr::munch(ParmString word, GuessInfo * gi, bool cross) const |
860 | 1.56k | { |
861 | 1.56k | LookupInfo li(0, LookupInfo::AlwaysTrue); |
862 | 1.56k | CheckInfo ci; |
863 | 1.56k | gi->reset(); |
864 | 1.56k | CasePattern cp = lang->LangImpl::case_pattern(word); |
865 | 1.56k | if (cp == AllUpper) return; |
866 | 1.55k | if (cp != FirstUpper) |
867 | 1.18k | prefix_check(li, word, ci, gi, cross); |
868 | 1.55k | suffix_check(li, word, ci, gi, 0, NULL); |
869 | 1.55k | } |
870 | | |
871 | | WordAff * AffixMgr::expand(ParmString word, ParmString aff, |
872 | | ObjStack & buf, int limit) const |
873 | 534k | { |
874 | 534k | byte * empty = (byte *)buf.alloc(1); |
875 | 534k | *empty = 0; |
876 | | |
877 | 534k | byte * suf = (byte *)buf.alloc(aff.size() + 1); |
878 | 534k | byte * suf_e = suf; |
879 | 534k | byte * csuf = (byte *)buf.alloc(aff.size() + 1); |
880 | 534k | byte * csuf_e = csuf; |
881 | | |
882 | 534k | WordAff * head = (WordAff *)buf.alloc_bottom(sizeof(WordAff)); |
883 | 534k | WordAff * cur = head; |
884 | 534k | cur->word = buf.dup(word); |
885 | 534k | cur->aff = suf; |
886 | | |
887 | 534k | for (const byte * c = (const byte *)aff.str(), * end = c + aff.size(); |
888 | 1.30M | c != end; |
889 | 767k | ++c) |
890 | 767k | { |
891 | 767k | if (sFlag[*c]) *suf_e++ = *c; |
892 | 767k | if (sFlag[*c] && sFlag[*c]->allow_cross()) *csuf_e++ = *c; |
893 | | |
894 | 1.05M | for (PfxEntry * p = pFlag[*c]; p; p = p->flag_next) { |
895 | 285k | SimpleString newword = p->add(word, buf); |
896 | 285k | if (!newword) continue; |
897 | 122k | cur->next = (WordAff *)buf.alloc_bottom(sizeof(WordAff)); |
898 | 122k | cur = cur->next; |
899 | 122k | cur->word = newword; |
900 | 122k | cur->aff = p->allow_cross() ? csuf : empty; |
901 | 122k | } |
902 | 767k | } |
903 | | |
904 | 534k | *suf_e = 0; |
905 | 534k | *csuf_e = 0; |
906 | 534k | cur->next = 0; |
907 | | |
908 | 534k | if (limit == 0) return head; |
909 | | |
910 | 534k | WordAff * * end = &cur->next; |
911 | 534k | WordAff * * very_end = end; |
912 | 534k | size_t nsuf_s = suf_e - suf + 1; |
913 | | |
914 | 1.19M | for (WordAff * * cur = &head; cur != end; cur = &(*cur)->next) { |
915 | 657k | if ((int)(*cur)->word.size - max_strip_ >= limit) continue; |
916 | 657k | byte * nsuf = (byte *)buf.alloc(nsuf_s); |
917 | 657k | expand_suffix((*cur)->word, (*cur)->aff, buf, limit, nsuf, &very_end, word); |
918 | 657k | (*cur)->aff = nsuf; |
919 | 657k | } |
920 | | |
921 | 534k | return head; |
922 | 534k | } |
923 | | |
924 | | WordAff * AffixMgr::expand_suffix(ParmString word, const byte * aff, |
925 | | ObjStack & buf, int limit, |
926 | | byte * new_aff, WordAff * * * l, |
927 | | ParmString orig_word) const |
928 | 657k | { |
929 | 657k | WordAff * head = 0; |
930 | 657k | if (l) head = **l; |
931 | 657k | WordAff * * cur = l ? *l : &head; |
932 | 657k | bool expanded = false; |
933 | 657k | bool not_expanded = false; |
934 | 657k | if (!orig_word) orig_word = word; |
935 | | |
936 | 1.46M | while (*aff) { |
937 | 803k | if ((int)word.size() - max_strip_f[*aff] < limit) { |
938 | 25.4M | for (SfxEntry * p = sFlag[*aff]; p; p = p->flag_next) { |
939 | 24.6M | SimpleString newword = p->add(word, buf, limit, orig_word); |
940 | 24.6M | if (!newword) continue; |
941 | 4.64M | if (newword == EMPTY) {not_expanded = true; continue;} |
942 | 4.64M | *cur = (WordAff *)buf.alloc_bottom(sizeof(WordAff)); |
943 | 4.64M | (*cur)->word = newword; |
944 | 4.64M | (*cur)->aff = (const byte *)EMPTY; |
945 | 4.64M | cur = &(*cur)->next; |
946 | 4.64M | expanded = true; |
947 | 4.64M | } |
948 | 803k | } |
949 | 803k | if (new_aff && (!expanded || not_expanded)) *new_aff++ = *aff; |
950 | 803k | ++aff; |
951 | 803k | } |
952 | 657k | *cur = 0; |
953 | 657k | if (new_aff) *new_aff = 0; |
954 | 657k | if (l) *l = cur; |
955 | 657k | return head; |
956 | 657k | } |
957 | | |
958 | | CheckAffixRes AffixMgr::check_affix(ParmString word, char aff) const |
959 | 0 | { |
960 | 0 | CheckAffixRes res = InvalidAffix; |
961 | | |
962 | 0 | for (PfxEntry * p = pFlag[(unsigned char)aff]; p; p = p->flag_next) { |
963 | 0 | res = InapplicableAffix; |
964 | 0 | if (p->applicable(word)) return ValidAffix; |
965 | 0 | } |
966 | | |
967 | 0 | for (SfxEntry * p = sFlag[(unsigned char)aff]; p; p = p->flag_next) { |
968 | 0 | if (res == InvalidAffix) res = InapplicableAffix; |
969 | 0 | if (p->applicable(word)) return ValidAffix; |
970 | 0 | } |
971 | | |
972 | 0 | return res; |
973 | 0 | } |
974 | | |
975 | | |
976 | | |
977 | | ////////////////////////////////////////////////////////////////////// |
978 | | // |
979 | | // LookupInfo |
980 | | // |
981 | | |
982 | | int LookupInfo::lookup (ParmString word, const SensitiveCompare * c, |
983 | | char achar, |
984 | | WordEntry & o, GuessInfo * gi) const |
985 | 603k | { |
986 | 603k | SpellerImpl::WS::const_iterator i = begin; |
987 | 603k | const char * g = 0; |
988 | 603k | if (mode == Word) { |
989 | 165k | do { |
990 | 165k | (*i)->lookup(word, c, o); |
991 | 168k | for (;!o.at_end(); o.adv()) { |
992 | 2.52k | if (TESTAFF(o.aff, achar)) |
993 | 236 | return 1; |
994 | 2.28k | else |
995 | 2.28k | g = o.word; |
996 | 2.52k | } |
997 | 165k | ++i; |
998 | 165k | } while (i != end); |
999 | 528k | } else if (mode == Clean) { |
1000 | 528k | do { |
1001 | 528k | (*i)->clean_lookup(word, o); |
1002 | 540k | for (;!o.at_end(); o.adv()) { |
1003 | 13.1k | if (TESTAFF(o.aff, achar)) |
1004 | 788 | return 1; |
1005 | 12.3k | else |
1006 | 12.3k | g = o.word; |
1007 | 13.1k | } |
1008 | 527k | ++i; |
1009 | 527k | } while (i != end); |
1010 | 528k | } else if (gi) { |
1011 | 501 | g = gi->dup(word); |
1012 | 501 | } |
1013 | 602k | if (gi && g) { |
1014 | 1.60k | CheckInfo * ci = gi->add(); |
1015 | 1.60k | ci->word = g; |
1016 | 1.60k | return -1; |
1017 | 1.60k | } |
1018 | 600k | return 0; |
1019 | 602k | } |
1020 | | |
1021 | | ////////////////////////////////////////////////////////////////////// |
1022 | | // |
1023 | | // Affix Entry |
1024 | | // |
1025 | | |
1026 | | bool PfxEntry::applicable(SimpleString word) const |
1027 | 0 | { |
1028 | 0 | unsigned int cond; |
1029 | | /* make sure all conditions match */ |
1030 | 0 | if ((word.size > stripl) && (word.size >= conds->num)) { |
1031 | 0 | const byte * cp = (const byte *) word.str; |
1032 | 0 | for (cond = 0; cond < conds->num; cond++) { |
1033 | 0 | if ((conds->get(*cp++) & (1 << cond)) == 0) |
1034 | 0 | break; |
1035 | 0 | } |
1036 | 0 | if (cond >= conds->num) return true; |
1037 | 0 | } |
1038 | 0 | return false; |
1039 | 0 | } |
1040 | | |
1041 | | // add prefix to this word assuming conditions hold |
1042 | | SimpleString PfxEntry::add(SimpleString word, ObjStack & buf) const |
1043 | 285k | { |
1044 | 285k | unsigned int cond; |
1045 | | /* make sure all conditions match */ |
1046 | 285k | if ((word.size > stripl) && (word.size >= conds->num)) { |
1047 | 285k | const byte * cp = (const byte *) word.str; |
1048 | 350k | for (cond = 0; cond < conds->num; cond++) { |
1049 | 227k | if ((conds->get(*cp++) & (1 << cond)) == 0) |
1050 | 163k | break; |
1051 | 227k | } |
1052 | 285k | if (cond >= conds->num) { |
1053 | | /* */ |
1054 | 122k | int alen = word.size - stripl; |
1055 | 122k | char * newword = (char *)buf.alloc(alen + appndl + 1); |
1056 | 122k | if (appndl) memcpy(newword, appnd, appndl); |
1057 | 122k | memcpy(newword + appndl, word + stripl, alen + 1); |
1058 | 122k | return SimpleString(newword, alen + appndl); |
1059 | 122k | } |
1060 | 285k | } |
1061 | 163k | return SimpleString(); |
1062 | 285k | } |
1063 | | |
1064 | | // check if this prefix entry matches |
1065 | | bool PfxEntry::check(const LookupInfo & linf, const AffixMgr * pmyMgr, |
1066 | | ParmString word, |
1067 | | CheckInfo & ci, GuessInfo * gi, bool cross) const |
1068 | 155k | { |
1069 | 155k | unsigned int cond; // condition number being examined |
1070 | 155k | unsigned tmpl; // length of tmpword |
1071 | 155k | WordEntry wordinfo; // hash entry of root word or NULL |
1072 | 155k | byte * cp; |
1073 | 155k | VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1); |
1074 | | |
1075 | | // on entry prefix is 0 length or already matches the beginning of the word. |
1076 | | // So if the remaining root word has positive length |
1077 | | // and if there are enough chars in root word and added back strip chars |
1078 | | // to meet the number of characters conditions, then test it |
1079 | | |
1080 | 155k | tmpl = word.size() - appndl; |
1081 | | |
1082 | 155k | if ((tmpl > 0) && (tmpl + stripl >= conds->num)) { |
1083 | | |
1084 | | // generate new root word by removing prefix and adding |
1085 | | // back any characters that would have been stripped |
1086 | | |
1087 | 153k | if (stripl) strcpy (tmpword, strip); |
1088 | 153k | strcpy ((tmpword + stripl), (word + appndl)); |
1089 | | |
1090 | | // now make sure all of the conditions on characters |
1091 | | // are met. Please see the appendix at the end of |
1092 | | // this file for more info on exactly what is being |
1093 | | // tested |
1094 | | |
1095 | 153k | cp = (byte *)tmpword; |
1096 | 154k | for (cond = 0; cond < conds->num; cond++) { |
1097 | 7.49k | if ((conds->get(*cp++) & (1 << cond)) == 0) break; |
1098 | 7.49k | } |
1099 | | |
1100 | | // if all conditions are met then check if resulting |
1101 | | // root word in the dictionary |
1102 | | |
1103 | 153k | if (cond >= conds->num) { |
1104 | 147k | CheckInfo * lci = 0; |
1105 | 147k | CheckInfo * guess = 0; |
1106 | 147k | tmpl += stripl; |
1107 | | |
1108 | 147k | int res = linf.lookup(tmpword, &linf.sp->s_cmp_end, achar, wordinfo, gi); |
1109 | | |
1110 | 147k | if (res == 1) { |
1111 | | |
1112 | 1 | lci = &ci; |
1113 | 1 | lci->word = wordinfo.word; |
1114 | 1 | goto quit; |
1115 | | |
1116 | 147k | } else if (res == -1) { |
1117 | | |
1118 | 184 | guess = gi->head; |
1119 | | |
1120 | 184 | } |
1121 | | |
1122 | | // prefix matched but no root word was found |
1123 | | // if XPRODUCT is allowed, try again but now |
1124 | | // cross checked combined with a suffix |
1125 | | |
1126 | 147k | if (gi) |
1127 | 6.61k | lci = gi->head; |
1128 | | |
1129 | 147k | if (cross && xpflg & XPRODUCT) { |
1130 | 147k | if (pmyMgr->suffix_check(linf, ParmString(tmpword, tmpl), |
1131 | 147k | ci, gi, |
1132 | 147k | XPRODUCT, (AffEntry *)this)) { |
1133 | 26 | lci = &ci; |
1134 | | |
1135 | 147k | } else if (gi) { |
1136 | | |
1137 | 6.61k | CheckInfo * stop = lci; |
1138 | 6.61k | for (lci = gi->head; |
1139 | 6.76k | lci != stop; |
1140 | 6.61k | lci = const_cast<CheckInfo *>(lci->next)) |
1141 | 152 | { |
1142 | 152 | lci->pre_flag = achar; |
1143 | 152 | lci->pre_strip_len = stripl; |
1144 | 152 | lci->pre_add_len = appndl; |
1145 | 152 | lci->pre_add = appnd; |
1146 | 152 | } |
1147 | | |
1148 | 140k | } else { |
1149 | | |
1150 | 140k | lci = 0; |
1151 | | |
1152 | 140k | } |
1153 | 147k | } |
1154 | | |
1155 | 147k | if (guess) |
1156 | 184 | lci = guess; |
1157 | | |
1158 | 147k | quit: |
1159 | 147k | if (lci) { |
1160 | 852 | lci->pre_flag = achar; |
1161 | 852 | lci->pre_strip_len = stripl; |
1162 | 852 | lci->pre_add_len = appndl; |
1163 | 852 | lci->pre_add = appnd; |
1164 | 852 | } |
1165 | 147k | if (lci == &ci) return true; |
1166 | 147k | } |
1167 | 153k | } |
1168 | 155k | return false; |
1169 | 155k | } |
1170 | | |
1171 | | bool SfxEntry::applicable(SimpleString word) const |
1172 | 0 | { |
1173 | 0 | int cond; |
1174 | | /* make sure all conditions match */ |
1175 | 0 | if ((word.size > stripl) && (word.size >= conds->num)) { |
1176 | 0 | const byte * cp = (const byte *) (word + word.size); |
1177 | 0 | for (cond = conds->num; --cond >=0; ) { |
1178 | 0 | if ((conds->get(*--cp) & (1 << cond)) == 0) |
1179 | 0 | break; |
1180 | 0 | } |
1181 | 0 | if (cond < 0) return true; |
1182 | 0 | } |
1183 | 0 | return false; |
1184 | 0 | } |
1185 | | |
1186 | | // add suffix to this word assuming conditions hold |
1187 | | SimpleString SfxEntry::add(SimpleString word, ObjStack & buf, |
1188 | | int limit, SimpleString orig_word) const |
1189 | 24.6M | { |
1190 | 24.6M | int cond; |
1191 | | /* make sure all conditions match */ |
1192 | 24.6M | if ((orig_word.size > stripl) && (orig_word.size >= conds->num)) { |
1193 | 18.9M | const byte * cp = (const byte *) (orig_word + orig_word.size); |
1194 | 37.3M | for (cond = conds->num; --cond >=0; ) { |
1195 | 32.6M | if ((conds->get(*--cp) & (1 << cond)) == 0) |
1196 | 14.3M | break; |
1197 | 32.6M | } |
1198 | 18.9M | if (cond < 0) { |
1199 | 4.64M | int alen = word.size - stripl; |
1200 | 4.64M | if (alen >= limit) return EMPTY; |
1201 | | /* we have a match so add suffix */ |
1202 | 4.64M | char * newword = (char *)buf.alloc(alen + appndl + 1); |
1203 | 4.64M | memcpy(newword, word, alen); |
1204 | 4.64M | memcpy(newword + alen, appnd, appndl + 1); |
1205 | 4.64M | return SimpleString(newword, alen + appndl); |
1206 | 4.64M | } |
1207 | 18.9M | } |
1208 | 20.0M | return SimpleString(); |
1209 | 24.6M | } |
1210 | | |
1211 | | // see if this suffix is present in the word |
1212 | | bool SfxEntry::check(const LookupInfo & linf, ParmString word, |
1213 | | CheckInfo & ci, GuessInfo * gi, |
1214 | | int optflags, AffEntry* ppfx) |
1215 | 48.8M | { |
1216 | 48.8M | unsigned tmpl; // length of tmpword |
1217 | 48.8M | int cond; // condition beng examined |
1218 | 48.8M | WordEntry wordinfo; // hash entry pointer |
1219 | 48.8M | byte * cp; |
1220 | 48.8M | VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1); |
1221 | 48.8M | PfxEntry* ep = (PfxEntry *) ppfx; |
1222 | | |
1223 | | // if this suffix is being cross checked with a prefix |
1224 | | // but it does not support cross products skip it |
1225 | | |
1226 | 48.8M | if ((optflags & XPRODUCT) != 0 && (xpflg & XPRODUCT) == 0) |
1227 | 203 | return false; |
1228 | | |
1229 | | // upon entry suffix is 0 length or already matches the end of the word. |
1230 | | // So if the remaining root word has positive length |
1231 | | // and if there are enough chars in root word and added back strip chars |
1232 | | // to meet the number of characters conditions, then test it |
1233 | | |
1234 | 48.8M | tmpl = word.size() - appndl; |
1235 | | |
1236 | 48.8M | if ((tmpl > 0) && (tmpl + stripl >= conds->num)) { |
1237 | | |
1238 | | // generate new root word by removing suffix and adding |
1239 | | // back any characters that would have been stripped or |
1240 | | // or null terminating the shorter string |
1241 | | |
1242 | 41.3M | strcpy (tmpword, word); |
1243 | 41.3M | cp = (byte *)(tmpword + tmpl); |
1244 | 41.3M | if (stripl) { |
1245 | 39.5M | strcpy ((char *)cp, strip); |
1246 | 39.5M | tmpl += stripl; |
1247 | 39.5M | cp = (byte *)(tmpword + tmpl); |
1248 | 39.5M | } else *cp = '\0'; |
1249 | | |
1250 | | // now make sure all of the conditions on characters |
1251 | | // are met. Please see the appendix at the end of |
1252 | | // this file for more info on exactly what is being |
1253 | | // tested |
1254 | | |
1255 | 87.6M | for (cond = conds->num; --cond >= 0; ) { |
1256 | 87.1M | if ((conds->get(*--cp) & (1 << cond)) == 0) break; |
1257 | 87.1M | } |
1258 | | |
1259 | | // if all conditions are met then check if resulting |
1260 | | // root word in the dictionary |
1261 | | |
1262 | 41.3M | if (cond < 0) { |
1263 | 456k | CheckInfo * lci = 0; |
1264 | 456k | tmpl += stripl; |
1265 | 456k | const SensitiveCompare * cmp = |
1266 | 456k | optflags & XPRODUCT ? &linf.sp->s_cmp_middle : &linf.sp->s_cmp_begin; |
1267 | 456k | int res = linf.lookup(tmpword, cmp, achar, wordinfo, gi); |
1268 | 456k | if (res == 1 |
1269 | 456k | && ((optflags & XPRODUCT) == 0 || TESTAFF(wordinfo.aff, ep->achar))) |
1270 | 907 | { |
1271 | 907 | lci = &ci; |
1272 | 907 | lci->word = wordinfo.word; |
1273 | 455k | } else if (res == 1 && gi) { |
1274 | 40 | lci = gi->add(); |
1275 | 40 | lci->word = wordinfo.word; |
1276 | 455k | } else if (res == -1) { // gi must be defined |
1277 | 1.42k | lci = gi->head; |
1278 | 1.42k | } |
1279 | | |
1280 | 456k | if (lci) { |
1281 | 2.37k | lci->suf_flag = achar; |
1282 | 2.37k | lci->suf_strip_len = stripl; |
1283 | 2.37k | lci->suf_add_len = appndl; |
1284 | 2.37k | lci->suf_add = appnd; |
1285 | 2.37k | } |
1286 | | |
1287 | 456k | if (lci == &ci) return true; |
1288 | 456k | } |
1289 | 41.3M | } |
1290 | 48.8M | return false; |
1291 | 48.8M | } |
1292 | | |
1293 | | ////////////////////////////////////////////////////////////////////// |
1294 | | // |
1295 | | // new_affix_mgr |
1296 | | // |
1297 | | |
1298 | | |
1299 | | PosibErr<AffixMgr *> new_affix_mgr(ParmString name, |
1300 | | Conv & iconv, |
1301 | | const Language * lang) |
1302 | 1.02k | { |
1303 | 1.02k | if (name == "none") |
1304 | 0 | return 0; |
1305 | | //CERR << "NEW AFFIX MGR\n"; |
1306 | 1.02k | String file; |
1307 | 1.02k | file += lang->data_dir(); |
1308 | 1.02k | file += '/'; |
1309 | 1.02k | file += lang->name(); |
1310 | 1.02k | file += "_affix.dat"; |
1311 | 1.02k | AffixMgr * affix; |
1312 | 1.02k | affix = new AffixMgr(lang); |
1313 | 1.02k | PosibErrBase pe = affix->setup(file, iconv); |
1314 | 1.02k | if (pe.has_err()) { |
1315 | 0 | delete affix; |
1316 | 0 | return pe; |
1317 | 1.02k | } else { |
1318 | 1.02k | return affix; |
1319 | 1.02k | } |
1320 | 1.02k | } |
1321 | | } |
1322 | | |
1323 | | /************************************************************************** |
1324 | | |
1325 | | Appendix: Understanding Affix Code |
1326 | | |
1327 | | |
1328 | | An affix is either a prefix or a suffix attached to root words to make |
1329 | | other words. |
1330 | | |
1331 | | Basically a Prefix or a Suffix is set of AffEntry objects |
1332 | | which store information about the prefix or suffix along |
1333 | | with supporting routines to check if a word has a particular |
1334 | | prefix or suffix or a combination. |
1335 | | |
1336 | | The structure affentry is defined as follows: |
1337 | | |
1338 | | struct AffEntry |
1339 | | { |
1340 | | unsigned char achar; // char used to represent the affix |
1341 | | char * strip; // string to strip before adding affix |
1342 | | char * appnd; // the affix string to add |
1343 | | short stripl; // length of the strip string |
1344 | | short appndl; // length of the affix string |
1345 | | short numconds; // the number of conditions that must be met |
1346 | | short xpflg; // flag: XPRODUCT- combine both prefix and suffix |
1347 | | char conds[SETSIZE]; // array which encodes the conditions to be met |
1348 | | }; |
1349 | | |
1350 | | |
1351 | | Here is a suffix borrowed from the en_US.aff file. This file |
1352 | | is whitespace delimited. |
1353 | | |
1354 | | SFX D Y 4 |
1355 | | SFX D 0 e d |
1356 | | SFX D y ied [^aeiou]y |
1357 | | SFX D 0 ed [^ey] |
1358 | | SFX D 0 ed [aeiou]y |
1359 | | |
1360 | | This information can be interpreted as follows: |
1361 | | |
1362 | | In the first line has 4 fields |
1363 | | |
1364 | | Field |
1365 | | ----- |
1366 | | 1 SFX - indicates this is a suffix |
1367 | | 2 D - is the name of the character flag which represents this suffix |
1368 | | 3 Y - indicates it can be combined with prefixes (cross product) |
1369 | | 4 4 - indicates that sequence of 4 affentry structures are needed to |
1370 | | properly store the affix information |
1371 | | |
1372 | | The remaining lines describe the unique information for the 4 SfxEntry |
1373 | | objects that make up this affix. Each line can be interpreted |
1374 | | as follows: (note fields 1 and 2 are as a check against line 1 info) |
1375 | | |
1376 | | Field |
1377 | | ----- |
1378 | | 1 SFX - indicates this is a suffix |
1379 | | 2 D - is the name of the character flag for this affix |
1380 | | 3 y - the string of chars to strip off before adding affix |
1381 | | (a 0 here indicates the NULL string) |
1382 | | 4 ied - the string of affix characters to add |
1383 | | 5 [^aeiou]y - the conditions which must be met before the affix |
1384 | | can be applied |
1385 | | |
1386 | | Field 5 is interesting. Since this is a suffix, field 5 tells us that |
1387 | | there are 2 conditions that must be met. The first condition is that |
1388 | | the next to the last character in the word must *NOT* be any of the |
1389 | | following "a", "e", "i", "o" or "u". The second condition is that |
1390 | | the last character of the word must end in "y". |
1391 | | |
1392 | | So how can we encode this information concisely and be able to |
1393 | | test for both conditions in a fast manner? The answer is found |
1394 | | but studying the wonderful ispell code of Geoff Kuenning, et.al. |
1395 | | (now available under a normal BSD license). |
1396 | | |
1397 | | If we set up a conds array of 256 bytes indexed (0 to 255) and access it |
1398 | | using a character (cast to an unsigned char) of a string, we have 8 bits |
1399 | | of information we can store about that character. Specifically we |
1400 | | could use each bit to say if that character is allowed in any of the |
1401 | | last (or first for prefixes) 8 characters of the word. |
1402 | | |
1403 | | Basically, each character at one end of the word (up to the number |
1404 | | of conditions) is used to index into the conds array and the resulting |
1405 | | value found there says whether the that character is valid for a |
1406 | | specific character position in the word. |
1407 | | |
1408 | | For prefixes, it does this by setting bit 0 if that char is valid |
1409 | | in the first position, bit 1 if valid in the second position, and so on. |
1410 | | |
1411 | | If a bit is not set, then that char is not valid for that position in the |
1412 | | word. |
1413 | | |
1414 | | If working with suffixes bit 0 is used for the character closest |
1415 | | to the front, bit 1 for the next character towards the end, ..., |
1416 | | with bit numconds-1 representing the last char at the end of the string. |
1417 | | |
1418 | | Note: since entries in the conds[] are 8 bits, only 8 conditions |
1419 | | (read that only 8 character positions) can be examined at one |
1420 | | end of a word (the beginning for prefixes and the end for suffixes. |
1421 | | |
1422 | | So to make this clearer, lets encode the conds array values for the |
1423 | | first two affentries for the suffix D described earlier. |
1424 | | |
1425 | | |
1426 | | For the first affentry: |
1427 | | numconds = 1 (only examine the last character) |
1428 | | |
1429 | | conds['e'] = (1 << 0) (the word must end in an E) |
1430 | | all others are all 0 |
1431 | | |
1432 | | For the second affentry: |
1433 | | numconds = 2 (only examine the last two characters) |
1434 | | |
1435 | | conds[X] = conds[X] | (1 << 0) (aeiou are not allowed) |
1436 | | where X is all characters *but* a, e, i, o, or u |
1437 | | |
1438 | | |
1439 | | conds['y'] = (1 << 1) (the last char must be a y) |
1440 | | all other bits for all other entries in the conds array are zero |
1441 | | |
1442 | | |
1443 | | **************************************************************************/ |