Coverage Report

Created: 2025-08-26 06:57

/src/aspell/modules/speller/default/affix.cpp
Line
Count
Source (jump to first uncovered line)
1
// This file is part of The New Aspell
2
// Copyright (C) 2004 by Kevin Atkinson under the GNU LGPL
3
// license version 2.0 or 2.1.  You should have received a copy of the
4
// LGPL license along with this library if you did not you can find it
5
// at http://www.gnu.org/.
6
//
7
// This code is based on the the MySpell affix code:
8
//
9
/*
10
 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada And
11
 * Contributors.  All rights reserved.
12
 *
13
 * Redistribution and use in source and binary forms, with or without
14
 * modification, are permitted provided that the following conditions
15
 * are met:
16
 *
17
 * 1. Redistributions of source code must retain the above copyright
18
 *    notice, this list of conditions and the following disclaimer.
19
 *
20
 * 2. Redistributions in binary form must reproduce the above copyright
21
 *    notice, this list of conditions and the following disclaimer in the
22
 *    documentation and/or other materials provided with the distribution.
23
 *
24
 * 3. All modifications to the source code must be clearly marked as
25
 *    such.  Binary redistributions based on modified source code
26
 *    must be clearly marked as modified versions in the documentation
27
 *    and/or other materials provided with the distribution.
28
 *
29
 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
30
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
31
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
32
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
33
 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
34
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
35
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
36
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40
 * SUCH DAMAGE.
41
 *
42
 */
43
44
#include <cstdlib>
45
#include <cstring>
46
#include <cstdio>
47
48
//#include "iostream.hpp"
49
50
#include "affix.hpp"
51
#include "errors.hpp"
52
#include "getdata.hpp"
53
#include "parm_string.hpp"
54
#include "check_list.hpp"
55
#include "speller_impl.hpp"
56
#include "vararray.hpp"
57
#include "lsort.hpp"
58
#include "hash-t.hpp"
59
60
#include "gettext.h"
61
62
using namespace std;
63
64
namespace aspeller {
65
66
typedef unsigned char byte;
67
static char EMPTY[1] = {0};
68
69
//////////////////////////////////////////////////////////////////////
70
//
71
// Entry struct definations
72
//
73
74
struct Conds
75
{
76
  char * str;
77
  unsigned num;
78
  char conds[SETSIZE];
79
40.4M
  char get(byte i) const {return conds[i];}
80
};
81
82
struct AffEntry
83
{
84
  const char *   appnd;
85
  const char *   strip;
86
  byte           appndl;
87
  byte           stripl;
88
  byte           xpflg;
89
  char           achar;
90
  const Conds *  conds;
91
  //unsigned int numconds;
92
  //char         conds[SETSIZE];
93
};
94
95
// A Prefix Entry
96
  
97
struct PfxEntry : public AffEntry
98
{
99
  PfxEntry * next;
100
  PfxEntry * next_eq;
101
  PfxEntry * next_ne;
102
  PfxEntry * flag_next;
103
5.55k
  PfxEntry() {}
104
105
  bool check(const LookupInfo &, const AffixMgr * pmyMgr,
106
             ParmString, CheckInfo &, GuessInfo *, bool cross = true) const;
107
108
67.6k
  inline bool          allow_cross() const { return ((xpflg & XPRODUCT) != 0); }
109
5.55k
  inline byte flag() const { return achar;  }
110
752k
  inline const char *  key() const  { return appnd;  }
111
  bool applicable(SimpleString) const;
112
  SimpleString add(SimpleString, ObjStack & buf) const;
113
};
114
115
// A Suffix Entry
116
117
struct SfxEntry : public AffEntry
118
{
119
  const char * rappnd; // this is set in AffixMgr::build_sfxlist
120
  
121
  SfxEntry *   next;
122
  SfxEntry *   next_eq;
123
  SfxEntry *   next_ne;
124
  SfxEntry *   flag_next;
125
126
559k
  SfxEntry() {}
127
128
  bool check(const LookupInfo &, ParmString, CheckInfo &, GuessInfo *,
129
             int optflags, AffEntry * ppfx);
130
131
363k
  inline bool          allow_cross() const { return ((xpflg & XPRODUCT) != 0); }
132
559k
  inline byte flag() const { return achar;  }
133
398M
  inline const char *  key() const  { return rappnd; } 
134
  bool applicable(SimpleString) const;
135
  SimpleString add(SimpleString, ObjStack & buf, int limit, SimpleString) const;
136
};
137
138
//////////////////////////////////////////////////////////////////////
139
//
140
// Utility functions declarations
141
//
142
143
/* return 1 if s1 is subset of s2 */
144
static bool isSubset(const char * s1, const char * s2)
145
191M
{
146
806M
  while( *s1 && (*s1 == *s2) ) {
147
614M
    s1++;
148
614M
    s2++;
149
614M
  }
150
191M
  return (*s1 == '\0');
151
191M
}
152
153
// return 1 if s1 (reversed) is a leading subset of end of s2
154
static bool isRevSubset(const char * s1, const char * end_of_s2, int len)
155
3.55M
{
156
7.99M
  while( (len > 0) && *s1 && (*s1 == *end_of_s2) ) {
157
4.43M
    s1++;
158
4.43M
    end_of_s2--;
159
4.43M
    len --;
160
4.43M
  }
161
3.55M
  return (*s1 == '\0');
162
3.55M
}
163
164
template <class T>
165
struct AffixLess
166
{
167
6.05M
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
aspeller::AffixLess<aspeller::PfxEntry>::operator()(aspeller::PfxEntry*, aspeller::PfxEntry*) const
Line
Count
Source
167
5.88k
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
aspeller::AffixLess<aspeller::SfxEntry>::operator()(aspeller::SfxEntry*, aspeller::SfxEntry*) const
Line
Count
Source
167
6.04M
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
168
};
169
170
// struct StringLookup {
171
//   struct Parms {
172
//     typedef const char * Value;
173
//     typedef const char * Key;
174
//     static const bool is_multi = false;
175
//     hash<const char *> hfun;
176
//     size_t hash(const char * s) {return hfun(s);}
177
//     bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;}
178
//     const char * key(const char * c) {return c;}
179
//   };
180
//   typedef HashTable<Parms> Lookup;
181
//   Lookup lookup;
182
//   ObjStack * data_buf;
183
//   StringLookup(ObjStack * b) : data_buf(b) {}
184
//   const char * dup(const char * orig) {
185
//     pair<Lookup::iterator, bool> res = lookup.insert(orig);
186
//     if (res.second) *res.first = data_buf->dup(orig);
187
//     return *res.first;
188
//     //return data_buf->dup(orig);
189
//   }
190
// };
191
192
struct CondsLookupParms {
193
  typedef const Conds * Value;
194
  typedef const char * Key;
195
  static const bool is_multi = false;
196
  acommon::hash<const char *> hfun;
197
584k
  size_t hash(const char * s) {return hfun(s);}
198
875k
  bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;}
199
894k
  const char * key(const Conds * c) {return c->str;}
200
};
201
202
typedef HashTable<CondsLookupParms> CondsLookup;
203
204
// normalizes and checks the cond_str
205
// returns the length of the new string or -1 if invalid
206
static int normalize_cond_str(char * str)
207
564k
{
208
564k
  char * s = str;
209
564k
  char * d = str;
210
2.90M
  while (*s) {
211
2.34M
    if (*s != '[') {
212
2.27M
      *d++ = *s++;
213
2.27M
    } else if (s[1] == '\0' || s[1] == ']') {
214
0
      return -1;
215
68.6k
    } else if (s[2] == ']') {
216
0
      *d++ = s[1];
217
0
      s += 3;
218
68.6k
    } else {
219
68.6k
      *d++ = *s++;
220
68.6k
      if (*s == '^') *d++ = *s++;
221
258k
      while (*s != ']') {
222
189k
        if (*s == '\0' || *s == '[') return -1;
223
189k
        char * min = s;
224
404k
        for (char * i = s + 1; *i != ']'; ++i) {
225
214k
          if ((byte)*i < (byte)*min) min = i;}
226
189k
        char c = *s;
227
189k
        *d++ = *min;
228
189k
        *min = c;
229
189k
        ++s;
230
189k
      }
231
68.6k
      *d++ = *s++;
232
68.6k
    }
233
2.34M
  }
234
564k
  *d = '\0';
235
564k
  return d - str;
236
564k
}
237
238
static void encodeit(CondsLookup &, ObjStack &, 
239
                     AffEntry * ptr, char * cs);
240
241
//////////////////////////////////////////////////////////////////////
242
//
243
// Affix Manager
244
//
245
246
PosibErr<void> AffixMgr::setup(ParmString affpath, Conv & iconv)
247
467
{
248
  // register hash manager and load affix data from aff file
249
  //cpdmin = 3;  // default value
250
467
  max_strip_ = 0;
251
120k
  for (int i=0; i < SETSIZE; i++) {
252
119k
    pStart[i] = NULL;
253
119k
    sStart[i] = NULL;
254
119k
    pFlag[i] = NULL;
255
119k
    sFlag[i] = NULL;
256
119k
    max_strip_f[i] = 0;
257
119k
  }
258
467
  return parse_file(affpath, iconv);
259
467
}
260
261
AffixMgr::AffixMgr(const Language * l) 
262
467
  : lang(l), data_buf(1024*16) {}
263
264
467
AffixMgr::~AffixMgr() {}
265
266
static inline void max_(int & lhs, int rhs) 
267
772k
{
268
772k
  if (lhs < rhs) lhs = rhs;
269
772k
}
270
271
// read in aff file and build up prefix and suffix entry objects 
272
PosibErr<void> AffixMgr::parse_file(const char * affpath, Conv & iconv)
273
467
{
274
  // io buffers
275
467
  String buf; DataPair dp;
276
277
467
  CondsLookup conds_lookup;
278
 
279
  // open the affix file
280
467
  affix_file = data_buf.dup(affpath);
281
467
  FStream afflst;
282
467
  RET_ON_ERR(afflst.open(affpath,"r"));
283
284
  // step one is to parse the affix file building up the internal
285
  // affix data structures
286
287
  // read in each line ignoring any that do not
288
  // start with a known line type indicator
289
290
467
  char prev_aff = '\0';
291
292
77.3k
  while (getdata_pair(afflst,dp,buf)) {
293
76.8k
    char affix_type = ' ';
294
295
    /* parse in the name of the character set used by the .dict and .aff */
296
297
76.8k
    if (dp.key == "SET") {
298
467
      String buf;
299
467
      encoding = data_buf.dup(fix_encoding_str(dp.value, buf));
300
467
      if (strcmp(encoding, lang->data_encoding()) != 0)
301
0
        return make_err(incorrect_encoding, affix_file, lang->data_encoding(), encoding);
302
467
    }
303
304
    /* parse in the flag used by the controlled compound words */
305
    //else if (d.key == "COMPOUNDFLAG")
306
    //  compound = data_buf.dup(d.value);
307
308
    /* parse in the flag used by the controlled compound words */
309
    //else if (d.key == "COMPOUNDMIN")
310
    //  cpdmin = atoi(d.value); // FiXME
311
312
    //else if (dp.key == "TRY" || dp.key == "REP");
313
314
76.4k
    else if (dp.key == "PFX" || dp.key == "SFX")
315
12.4k
      affix_type = dp.key[0];
316
317
76.8k
    if (affix_type == ' ') continue;
318
319
    //
320
    // parse this affix: P - prefix, S - suffix
321
    //
322
323
12.4k
    int numents = 0;      // number of affentry structures to parse
324
12.4k
    char achar='\0';      // affix char identifier
325
12.4k
    short xpflg=0;
326
12.4k
    AffEntry * nptr;
327
12.4k
    {
328
      // split affix header line into pieces
329
12.4k
      split(dp);
330
12.4k
      if (dp.key.empty()) goto error;
331
      // key is affix char
332
12.4k
      const char * astr = iconv(dp.key);
333
12.4k
      if (astr[0] == '\0' || astr[1] != '\0') goto error;
334
12.4k
      achar = astr[0];
335
12.4k
      if (achar == prev_aff) goto error_count;
336
12.4k
      prev_aff = achar;
337
338
12.4k
      split(dp);
339
12.4k
      if (dp.key.size != 1 || 
340
12.4k
          !(dp.key[0] == 'Y' || dp.key[0] == 'N')) goto error;
341
      // key is cross product indicator 
342
12.4k
      if (dp.key[0] == 'Y') xpflg = XPRODUCT;
343
    
344
12.4k
      split(dp);
345
12.4k
      if (dp.key.empty()) goto error;
346
      // key is number of affentries
347
      
348
12.4k
      numents = atoi(dp.key); 
349
  
350
577k
      for (int j = 0; j < numents; j++) {
351
564k
        getdata_pair(afflst, dp, buf);
352
353
564k
        if (affix_type == 'P') {
354
5.55k
          nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(PfxEntry));
355
5.55k
          new (nptr) PfxEntry;
356
559k
        } else {
357
559k
          nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(SfxEntry));
358
559k
          new (nptr) SfxEntry;
359
559k
        }
360
361
564k
        nptr->xpflg = xpflg;
362
363
564k
        split(dp);
364
564k
        if (dp.key.empty()) goto error;
365
        // key is affix charter
366
564k
        if (iconv(dp.key)[0] != achar) goto error_count;
367
564k
        nptr->achar = achar;
368
 
369
564k
        split(dp);
370
564k
        if (dp.key.empty()) goto error;
371
        // key is strip 
372
564k
        if (dp.key != "0") {
373
386k
          ParmString s0(iconv(dp.key));
374
386k
          max_(max_strip_, s0.size());
375
386k
          max_(max_strip_f[(byte)achar], s0.size());
376
386k
          nptr->strip = data_buf.dup(s0);
377
386k
          nptr->stripl = s0.size();
378
386k
        } else {
379
178k
          nptr->strip  = "";
380
178k
          nptr->stripl = 0;
381
178k
        }
382
    
383
564k
        split(dp);
384
564k
        if (dp.key.empty()) goto error;
385
        // key is affix string or 0 for null
386
564k
        if (dp.key != "0") {
387
563k
          nptr->appnd  = data_buf.dup(iconv(dp.key));
388
563k
          nptr->appndl = strlen(nptr->appnd);
389
563k
        } else {
390
1.21k
          nptr->appnd  = "";
391
1.21k
          nptr->appndl = 0;
392
1.21k
        }
393
    
394
564k
        split(dp);
395
564k
        if (dp.key.empty()) goto error;
396
        // key is the conditions descriptions
397
564k
        char * cond = iconv(dp.key);
398
564k
        int cond_len = normalize_cond_str(cond);
399
564k
        if (cond_len < 0)
400
0
          return (make_err(invalid_cond, MsgConv(lang)(cond))
401
0
                  .with_file(affix_file, dp.line_num));
402
564k
        if (nptr->stripl != 0) {
403
386k
          char * cc = cond;
404
386k
          if (affix_type == 'S') cc += cond_len - nptr->stripl;
405
386k
          if (cond_len < nptr->stripl || 
406
386k
              memcmp(cc, nptr->strip, nptr->stripl) != 0)
407
0
            return (make_err(invalid_cond_strip, 
408
0
                             MsgConv(lang)(cond), MsgConv(lang)(nptr->strip))
409
0
                    .with_file(affix_file, dp.line_num));
410
386k
        }
411
564k
        encodeit(conds_lookup, data_buf, nptr, cond);
412
    
413
        // now create SfxEntry or PfxEntry objects and use links to
414
        // build an ordered (sorted by affix string) list
415
564k
        if (affix_type == 'P')
416
5.55k
          build_pfxlist(static_cast<PfxEntry *>(nptr));
417
559k
        else
418
559k
          build_sfxlist(static_cast<SfxEntry *>(nptr)); 
419
564k
      }
420
12.4k
    }
421
12.4k
    continue;
422
12.4k
  error:
423
0
    return make_err(corrupt_affix, MsgConv(lang)(achar)).with_file(affix_file, dp.line_num);
424
0
  error_count:
425
0
    return make_err(corrupt_affix, MsgConv(lang)(achar), 
426
0
                    _("Possibly incorrect count.")).with_file(affix_file, dp.line_num);
427
12.4k
  }
428
467
  afflst.close();
429
430
  // now we can speed up performance greatly taking advantage of the 
431
  // relationship between the affixes and the idea of "subsets".
432
433
  // View each prefix as a potential leading subset of another and view
434
  // each suffix (reversed) as a potential trailing subset of another.
435
436
  // To illustrate this relationship if we know the prefix "ab" is
437
  // found in the word to examine, only prefixes that "ab" is a
438
  // leading subset of need be examined.  Furthermore is "ab" is not
439
  // present then none of the prefixes that "ab" is is a subset need
440
  // be examined.
441
442
  // The same argument goes for suffix string that are reversed.
443
444
  // Then to top this off why not examine the first char of the word
445
  // to quickly limit the set of prefixes to examine (i.e. the
446
  // prefixes to examine must be leading supersets of the first
447
  // character of the word (if they exist)
448
 
449
  // To take advantage of this "subset" relationship, we need to add
450
  // two links from entry.  One to take next if the current prefix
451
  // is found (call it nexteq) and one to take next if the current
452
  // prefix is not found (call it nextne).
453
454
  // Since we have built ordered lists, all that remains is to
455
  // properly initialize the nextne and nexteq pointers that relate
456
  // them
457
458
467
  process_pfx_order();
459
467
  process_sfx_order();
460
461
  //CERR.printf("%u\n", data_buf.calc_size()/1024);
462
463
467
  return no_err;
464
465
467
}
466
467
468
// we want to be able to quickly access prefix information
469
// both by prefix flag, and sorted by prefix string itself
470
// so we need to set up two indexes
471
472
PosibErr<void> AffixMgr::build_pfxlist(PfxEntry* pfxptr)
473
5.55k
{
474
5.55k
  PfxEntry * ptr;
475
5.55k
  PfxEntry * ep = pfxptr;
476
477
  // get the right starting point 
478
5.55k
  const char * key = ep->key();
479
5.55k
  const byte flg = ep->flag();
480
481
  // first index by flag which must exist
482
5.55k
  ptr = pFlag[flg];
483
5.55k
  ep->flag_next = ptr;
484
5.55k
  pFlag[flg] = ep;
485
486
  // next insert the affix string, it will be sorted latter
487
488
5.55k
  byte sp = *((const byte *)key);
489
5.55k
  ptr = pStart[sp];
490
5.55k
  ep->next = ptr;
491
5.55k
  pStart[sp] = ep;
492
5.55k
  return no_err;
493
5.55k
}
494
495
// we want to be able to quickly access suffix information
496
// both by suffix flag, and sorted by the reverse of the
497
// suffix string itself; so we need to set up two indexes
498
499
PosibErr<void> AffixMgr::build_sfxlist(SfxEntry* sfxptr)
500
559k
{
501
559k
  SfxEntry * ptr;
502
559k
  SfxEntry * ep = sfxptr;
503
559k
  char * tmp = (char *)data_buf.alloc(sfxptr->appndl + 1);
504
559k
  sfxptr->rappnd = tmp;
505
506
  // reverse the string
507
559k
  char * dest = tmp + sfxptr->appndl;
508
559k
  *dest-- = 0;
509
559k
  const char * src = sfxptr->appnd;
510
3.82M
  for (; dest >= tmp; --dest, ++src)
511
3.26M
    *dest = *src;
512
513
  /* get the right starting point */
514
559k
  const char * key = ep->key();
515
559k
  const byte flg = ep->flag();
516
517
  // first index by flag which must exist
518
559k
  ptr = sFlag[flg];
519
559k
  ep->flag_next = ptr;
520
559k
  sFlag[flg] = ep;
521
522
  // next insert the affix string, it will be sorted latter
523
    
524
559k
  byte sp = *((const byte *)key);
525
559k
  ptr = sStart[sp];
526
559k
  ep->next = ptr;
527
559k
  sStart[sp] = ep;
528
559k
  return no_err;
529
559k
}
530
531
532
533
// initialize the PfxEntry links NextEQ and NextNE to speed searching
534
PosibErr<void> AffixMgr::process_pfx_order()
535
467
{
536
467
  PfxEntry* ptr;
537
538
  // loop through each prefix list starting point
539
119k
  for (int i=1; i < SETSIZE; i++) {
540
541
119k
    ptr = pStart[i];
542
543
119k
    if (ptr && ptr->next)
544
698
      ptr = pStart[i] = sort(ptr, AffixLess<PfxEntry>());
545
546
    // look through the remainder of the list
547
    //  and find next entry with affix that 
548
    // the current one is not a subset of
549
    // mark that as destination for NextNE
550
    // use next in list that you are a subset
551
    // of as NextEQ
552
553
124k
    for (; ptr != NULL; ptr = ptr->next) {
554
555
5.55k
      PfxEntry * nptr = ptr->next;
556
8.22k
      for (; nptr != NULL; nptr = nptr->next) {
557
4.96k
        if (! isSubset( ptr->key() , nptr->key() )) break;
558
4.96k
      }
559
5.55k
      ptr->next_ne = nptr;
560
5.55k
      ptr->next_eq = NULL;
561
5.55k
      if ((ptr->next) && isSubset(ptr->key() , 
562
2.56k
                                  (ptr->next)->key())) 
563
651
        ptr->next_eq = ptr->next;
564
5.55k
    }
565
566
    // now clean up by adding smart search termination strings
567
    // if you are already a superset of the previous prefix
568
    // but not a subset of the next, search can end here
569
    // so set NextNE properly
570
571
119k
    ptr = pStart[i];
572
124k
    for (; ptr != NULL; ptr = ptr->next) {
573
5.55k
      PfxEntry * nptr = ptr->next;
574
5.55k
      PfxEntry * mptr = NULL;
575
8.22k
      for (; nptr != NULL; nptr = nptr->next) {
576
4.96k
        if (! isSubset(ptr->key(),nptr->key())) break;
577
2.66k
        mptr = nptr;
578
2.66k
      }
579
5.55k
      if (mptr) mptr->next_ne = NULL;
580
5.55k
    }
581
119k
  }
582
467
  return no_err;
583
467
}
584
585
586
587
// initialize the SfxEntry links NextEQ and NextNE to speed searching
588
PosibErr<void> AffixMgr::process_sfx_order()
589
467
{
590
467
  SfxEntry* ptr;
591
592
  // loop through each prefix list starting point
593
119k
  for (int i=1; i < SETSIZE; i++) {
594
595
119k
    ptr = sStart[i];
596
597
119k
    if (ptr && ptr->next)
598
4.13k
      ptr = sStart[i] = sort(ptr, AffixLess<SfxEntry>());
599
600
    // look through the remainder of the list
601
    //  and find next entry with affix that 
602
    // the current one is not a subset of
603
    // mark that as destination for NextNE
604
    // use next in list that you are a subset
605
    // of as NextEQ
606
607
677k
    for (; ptr != NULL; ptr = ptr->next) {
608
557k
      SfxEntry * nptr = ptr->next;
609
95.3M
      for (; nptr != NULL; nptr = nptr->next) {
610
95.3M
        if (! isSubset(ptr->key(),nptr->key())) break;
611
95.3M
      }
612
557k
      ptr->next_ne = nptr;
613
557k
      ptr->next_eq = NULL;
614
557k
      if ((ptr->next) && isSubset(ptr->key(),(ptr->next)->key())) 
615
456k
        ptr->next_eq = ptr->next;
616
557k
    }
617
618
619
    // now clean up by adding smart search termination strings:
620
    // if you are already a superset of the previous suffix
621
    // but not a subset of the next, search can end here
622
    // so set NextNE properly
623
624
119k
    ptr = sStart[i];
625
677k
    for (; ptr != NULL; ptr = ptr->next) {
626
557k
      SfxEntry * nptr = ptr->next;
627
557k
      SfxEntry * mptr = NULL;
628
95.3M
      for (; nptr != NULL; nptr = nptr->next) {
629
95.3M
        if (! isSubset(ptr->key(),nptr->key())) break;
630
94.7M
        mptr = nptr;
631
94.7M
      }
632
557k
      if (mptr) mptr->next_ne = NULL;
633
557k
    }
634
119k
  }
635
467
  return no_err;
636
467
}
637
638
// takes aff file condition string and creates the
639
// conds array - please see the appendix at the end of the
640
// file affentry.cxx which describes what is going on here
641
// in much more detail
642
643
static void encodeit(CondsLookup & l, ObjStack & buf, 
644
                     AffEntry * ptr, char * cs)
645
564k
{
646
564k
  byte c;
647
564k
  int i, j, k;
648
649
  // see if we already have this conds matrix
650
651
564k
  CondsLookup::iterator itr = l.find(cs);
652
564k
  if (!(itr == l.end())) {
653
552k
    ptr->conds = *itr;
654
552k
    return;
655
552k
  }
656
657
12.3k
  Conds * cds = (Conds *)buf.alloc_bottom(sizeof(Conds));
658
12.3k
  cds->str = buf.dup(cs);
659
12.3k
  l.insert(cds);
660
12.3k
  ptr->conds = cds;
661
662
12.3k
  int nc = strlen(cs);
663
12.3k
  VARARRAYM(byte, mbr, nc + 1, MAXLNLEN);
664
665
  // now clear the conditions array
666
12.3k
  memset(cds->conds, 0, sizeof(cds->conds));
667
668
  // now parse the string to create the conds array
669
  
670
12.3k
  int neg = 0;   // complement indicator
671
12.3k
  int grp = 0;   // group indicator
672
12.3k
  int n = 0;     // number of conditions
673
12.3k
  int ec = 0;    // end condition indicator
674
12.3k
  int nm = 0;    // number of member in group
675
676
  // if no condition just return
677
12.3k
  if (strcmp(cs,".")==0) {
678
467
    cds->num = 0;
679
467
    return;
680
467
  }
681
682
11.8k
  i = 0;
683
70.6k
  while (i < nc) {
684
58.8k
    c = *((byte *)(cs + i));
685
686
    // start group indicator
687
58.8k
    if (c == '[') {
688
5.58k
      grp = 1;
689
5.58k
      c = 0;
690
5.58k
    }
691
692
    // complement flag
693
58.8k
    if ((grp == 1) && (c == '^')) {
694
3.92k
      neg = 1;
695
3.92k
      c = 0;
696
3.92k
    }
697
698
    // end goup indicator
699
58.8k
    if (c == ']') {
700
5.58k
      ec = 1;
701
5.58k
      c = 0;
702
5.58k
    }
703
704
    // add character of group to list
705
58.8k
    if ((grp == 1) && (c != 0)) {
706
19.2k
      *(mbr + nm) = c;
707
19.2k
      nm++;
708
19.2k
      c = 0;
709
19.2k
    }
710
711
    // end of condition 
712
58.8k
    if (c != 0) {
713
24.4k
      ec = 1;
714
24.4k
    }
715
716
    
717
58.8k
    if (ec) {
718
30.0k
      if (grp == 1) {
719
5.58k
        if (neg == 0) {
720
          // set the proper bits in the condition array vals for those chars
721
8.47k
          for (j=0;j<nm;j++) {
722
6.80k
            k = (unsigned int) mbr[j];
723
6.80k
            cds->conds[k] = cds->conds[k] | (1 << n);
724
6.80k
          }
725
3.92k
        } else {
726
          // complement so set all of them and then unset indicated ones
727
1.00M
          for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n);
728
16.4k
          for (j=0;j<nm;j++) {
729
12.4k
            k = (unsigned int) mbr[j];
730
12.4k
            cds->conds[k] = cds->conds[k] & ~(1 << n);
731
12.4k
          }
732
3.92k
        }
733
5.58k
        neg = 0;
734
5.58k
        grp = 0;   
735
5.58k
        nm = 0;
736
24.4k
      } else {
737
        // not a group so just set the proper bit for this char
738
        // but first handle special case of . inside condition
739
24.4k
        if (c == '.') {
740
          // wild card character so set them all
741
0
          for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n);
742
24.4k
        } else {  
743
24.4k
          cds->conds[(unsigned int)c] = cds->conds[(unsigned int)c] | (1 << n);
744
24.4k
        }
745
24.4k
      }
746
30.0k
      n++;
747
30.0k
      ec = 0;
748
30.0k
    }
749
750
751
58.8k
    i++;
752
58.8k
  }
753
11.8k
  cds->num = n;
754
11.8k
  return;
755
12.3k
}
756
757
758
// check word for prefixes
759
bool AffixMgr::prefix_check (const LookupInfo & linf, ParmString word, 
760
                             CheckInfo & ci, GuessInfo * gi, bool cross) const
761
393k
{
762
393k
  if (word.empty()) return false;
763
 
764
  // first handle the special case of 0 length prefixes
765
393k
  PfxEntry * pe = pStart[0];
766
393k
  while (pe) {
767
0
    if (pe->check(linf,this,word,ci,gi)) return true;
768
0
    pe = pe->next;
769
0
  }
770
  
771
  // now handle the general case
772
393k
  byte sp = *reinterpret_cast<const byte *>(word.str());
773
393k
  PfxEntry * pptr = pStart[sp];
774
775
1.10M
  while (pptr) {
776
710k
    if (isSubset(pptr->key(),word)) {
777
77.7k
      if (pptr->check(linf,this,word,ci,gi,cross)) return true;
778
77.5k
      pptr = pptr->next_eq;
779
632k
    } else {
780
632k
      pptr = pptr->next_ne;
781
632k
    }
782
710k
  }
783
    
784
392k
  return false;
785
393k
}
786
787
788
// check word for suffixes
789
bool AffixMgr::suffix_check (const LookupInfo & linf, ParmString word, 
790
                             CheckInfo & ci, GuessInfo * gi,
791
                             int sfxopts, AffEntry * ppfx) const
792
479k
{
793
479k
  if (word.empty()) return false;
794
795
  // first handle the special case of 0 length suffixes
796
479k
  SfxEntry * se = sStart[0];
797
20.2M
  while (se) {
798
19.7M
    if (se->check(linf, word, ci, gi, sfxopts, ppfx)) return true;
799
19.7M
    se = se->next;
800
19.7M
  }
801
  
802
  // now handle the general case
803
479k
  byte sp = *((const byte *)(word + word.size() - 1));
804
479k
  SfxEntry * sptr = sStart[sp];
805
806
4.03M
  while (sptr) {
807
3.55M
    if (isRevSubset(sptr->key(), word + word.size() - 1, word.size())) {
808
2.22M
      if (sptr->check(linf, word, ci, gi, sfxopts, ppfx)) return true;
809
2.22M
      sptr = sptr->next_eq;
810
2.22M
    } else {
811
1.32M
      sptr = sptr->next_ne;
812
1.32M
    }
813
3.55M
  }
814
    
815
478k
  return false;
816
479k
}
817
818
// check if word with affixes is correctly spelled
819
bool AffixMgr::affix_check(const LookupInfo & linf, ParmString word, 
820
                           CheckInfo & ci, GuessInfo * gi) const
821
392k
{
822
392k
  if (word.empty()) return false;
823
824
  // Deal With Case in a semi-intelligent manner
825
392k
  CasePattern cp = lang->LangImpl::case_pattern(word);
826
392k
  ParmString pword = word;
827
392k
  ParmString sword = word;
828
392k
  CharVector lower;
829
392k
  if (cp == FirstUpper) {
830
12.8k
    lower.append(word, word.size() + 1);
831
12.8k
    lower[0] = lang->to_lower(word[0]);
832
12.8k
    pword = ParmString(lower.data(), lower.size() - 1);
833
379k
  } else if (cp == AllUpper) {
834
56.2k
    lower.resize(word.size() + 1);
835
56.2k
    unsigned int i = 0;
836
1.06M
    for (; i != word.size(); ++i)
837
1.00M
      lower[i] = lang->to_lower(word[i]);
838
56.2k
    lower[i] = '\0';
839
56.2k
    pword = ParmString(lower.data(), lower.size() - 1);
840
56.2k
    sword = pword;
841
56.2k
  }
842
843
  // check all prefixes (also crossed with suffixes if allowed) 
844
392k
  if (prefix_check(linf, pword, ci, gi)) return true;
845
846
  // if still not found check all suffixes
847
391k
  if (suffix_check(linf, sword, ci, gi, 0, NULL)) return true;
848
849
  // if still not found check again but with the lower case version
850
  // which can make a difference if the entire word matches the cond
851
  // string
852
391k
  if (cp == FirstUpper) {
853
12.8k
    return suffix_check(linf, pword, ci, gi, 0, NULL);
854
378k
  } else {
855
378k
    return false;
856
378k
  }
857
391k
}
858
859
void AffixMgr::munch(ParmString word, GuessInfo * gi, bool cross) const
860
1.07k
{
861
1.07k
  LookupInfo li(0, LookupInfo::AlwaysTrue);
862
1.07k
  CheckInfo ci;
863
1.07k
  gi->reset();
864
1.07k
  CasePattern cp = lang->LangImpl::case_pattern(word);
865
1.07k
  if (cp == AllUpper) return;
866
1.04k
  if (cp != FirstUpper)
867
989
    prefix_check(li, word, ci, gi, cross);
868
1.04k
  suffix_check(li, word, ci, gi, 0, NULL);
869
1.04k
}
870
871
WordAff * AffixMgr::expand(ParmString word, ParmString aff, 
872
                           ObjStack & buf, int limit) const
873
309k
{
874
309k
  byte * empty = (byte *)buf.alloc(1);
875
309k
  *empty = 0;
876
877
309k
  byte * suf  = (byte *)buf.alloc(aff.size() + 1); 
878
309k
  byte * suf_e = suf;
879
309k
  byte * csuf = (byte *)buf.alloc(aff.size() + 1); 
880
309k
  byte * csuf_e = csuf;
881
882
309k
  WordAff * head = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
883
309k
  WordAff * cur = head;
884
309k
  cur->word = buf.dup(word);
885
309k
  cur->aff  = suf;
886
887
309k
  for (const byte * c = (const byte *)aff.str(), * end = c + aff.size();
888
741k
       c != end; 
889
431k
       ++c) 
890
431k
  {
891
431k
    if (sFlag[*c]) *suf_e++ = *c; 
892
431k
    if (sFlag[*c] && sFlag[*c]->allow_cross()) *csuf_e++ = *c;
893
    
894
588k
    for (PfxEntry * p = pFlag[*c]; p; p = p->flag_next) {
895
157k
      SimpleString newword = p->add(word, buf);
896
157k
      if (!newword) continue;
897
67.6k
      cur->next = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
898
67.6k
      cur = cur->next;
899
67.6k
      cur->word = newword;
900
67.6k
      cur->aff = p->allow_cross() ? csuf : empty;
901
67.6k
    }
902
431k
  }
903
904
309k
  *suf_e = 0;
905
309k
  *csuf_e = 0;
906
309k
  cur->next = 0;
907
908
309k
  if (limit == 0) return head;
909
910
309k
  WordAff * * end = &cur->next;
911
309k
  WordAff * * very_end = end;
912
309k
  size_t nsuf_s = suf_e - suf + 1;
913
914
687k
  for (WordAff * * cur = &head; cur != end; cur = &(*cur)->next) {
915
377k
    if ((int)(*cur)->word.size - max_strip_ >= limit) continue;
916
377k
    byte * nsuf = (byte *)buf.alloc(nsuf_s);
917
377k
    expand_suffix((*cur)->word, (*cur)->aff, buf, limit, nsuf, &very_end, word);
918
377k
    (*cur)->aff = nsuf;
919
377k
  }
920
921
309k
  return head;
922
309k
}
923
924
WordAff * AffixMgr::expand_suffix(ParmString word, const byte * aff, 
925
                                  ObjStack & buf, int limit,
926
                                  byte * new_aff, WordAff * * * l,
927
                                  ParmString orig_word) const
928
377k
{
929
377k
  WordAff * head = 0;
930
377k
  if (l) head = **l;
931
377k
  WordAff * * cur = l ? *l : &head;
932
377k
  bool expanded     = false;
933
377k
  bool not_expanded = false;
934
377k
  if (!orig_word) orig_word = word;
935
936
824k
  while (*aff) {
937
447k
    if ((int)word.size() - max_strip_f[*aff] < limit) {
938
12.7M
      for (SfxEntry * p = sFlag[*aff]; p; p = p->flag_next) {
939
12.3M
        SimpleString newword = p->add(word, buf, limit, orig_word);
940
12.3M
        if (!newword) continue;
941
2.22M
        if (newword == EMPTY) {not_expanded = true; continue;}
942
2.22M
        *cur = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
943
2.22M
        (*cur)->word = newword;
944
2.22M
        (*cur)->aff  = (const byte *)EMPTY;
945
2.22M
        cur = &(*cur)->next;
946
2.22M
        expanded = true;
947
2.22M
      }
948
447k
    }
949
447k
    if (new_aff && (!expanded || not_expanded)) *new_aff++ = *aff;
950
447k
    ++aff;
951
447k
  }
952
377k
  *cur = 0;
953
377k
  if (new_aff) *new_aff = 0;
954
377k
  if (l) *l = cur;
955
377k
  return head;
956
377k
}
957
958
CheckAffixRes AffixMgr::check_affix(ParmString word, char aff) const
959
0
{
960
0
  CheckAffixRes res = InvalidAffix;
961
  
962
0
  for (PfxEntry * p = pFlag[(unsigned char)aff]; p; p = p->flag_next) {
963
0
    res = InapplicableAffix;
964
0
    if (p->applicable(word)) return ValidAffix;
965
0
  }
966
967
0
  for (SfxEntry * p = sFlag[(unsigned char)aff]; p; p = p->flag_next) {
968
0
    if (res == InvalidAffix) res = InapplicableAffix;
969
0
    if (p->applicable(word)) return ValidAffix;
970
0
  }
971
972
0
  return res;
973
0
}
974
975
976
977
//////////////////////////////////////////////////////////////////////
978
//
979
// LookupInfo
980
//
981
982
int LookupInfo::lookup (ParmString word, const SensitiveCompare * c, 
983
                        char achar, 
984
                        WordEntry & o, GuessInfo * gi) const
985
266k
{
986
266k
  SpellerImpl::WS::const_iterator i = begin;
987
266k
  const char * g = 0;
988
266k
  if (mode == Word) {
989
92.2k
    do {
990
92.2k
      (*i)->lookup(word, c, o);
991
95.7k
      for (;!o.at_end(); o.adv()) {
992
3.57k
        if (TESTAFF(o.aff, achar))
993
60
          return 1;
994
3.51k
        else
995
3.51k
          g = o.word;
996
3.57k
      }
997
92.2k
      ++i;
998
92.2k
    } while (i != end);
999
219k
  } else if (mode == Clean) {
1000
219k
    do {
1001
219k
      (*i)->clean_lookup(word, o);
1002
255k
      for (;!o.at_end(); o.adv()) {
1003
36.7k
        if (TESTAFF(o.aff, achar))
1004
832
          return 1;
1005
35.9k
        else
1006
35.9k
          g = o.word;
1007
36.7k
      }
1008
218k
      ++i;
1009
218k
    } while (i != end);
1010
219k
  } else if (gi) {
1011
389
    g = gi->dup(word);
1012
389
  }
1013
266k
  if (gi && g) {
1014
1.79k
    CheckInfo * ci = gi->add();
1015
1.79k
    ci->word = g;
1016
1.79k
    return -1;
1017
1.79k
  }
1018
264k
  return 0;
1019
266k
}
1020
1021
//////////////////////////////////////////////////////////////////////
1022
//
1023
// Affix Entry
1024
//
1025
1026
bool PfxEntry::applicable(SimpleString word) const
1027
0
{
1028
0
  unsigned int cond;
1029
  /* make sure all conditions match */
1030
0
  if ((word.size > stripl) && (word.size >= conds->num)) {
1031
0
    const byte * cp = (const byte *) word.str;
1032
0
    for (cond = 0;  cond < conds->num;  cond++) {
1033
0
      if ((conds->get(*cp++) & (1 << cond)) == 0)
1034
0
        break;
1035
0
    }
1036
0
    if (cond >= conds->num) return true;
1037
0
  }
1038
0
  return false;
1039
0
}
1040
1041
// add prefix to this word assuming conditions hold
1042
SimpleString PfxEntry::add(SimpleString word, ObjStack & buf) const
1043
157k
{
1044
157k
  unsigned int cond;
1045
  /* make sure all conditions match */
1046
157k
  if ((word.size > stripl) && (word.size >= conds->num)) {
1047
157k
    const byte * cp = (const byte *) word.str;
1048
193k
    for (cond = 0;  cond < conds->num;  cond++) {
1049
125k
      if ((conds->get(*cp++) & (1 << cond)) == 0)
1050
89.7k
        break;
1051
125k
    }
1052
157k
    if (cond >= conds->num) {
1053
      /* */
1054
67.6k
      int alen = word.size - stripl;
1055
67.6k
      char * newword = (char *)buf.alloc(alen + appndl + 1);
1056
67.6k
      if (appndl) memcpy(newword, appnd, appndl);
1057
67.6k
      memcpy(newword + appndl, word + stripl, alen + 1);
1058
67.6k
      return SimpleString(newword, alen + appndl);
1059
67.6k
    }
1060
157k
  }
1061
89.7k
  return SimpleString();
1062
157k
}
1063
1064
// check if this prefix entry matches 
1065
bool PfxEntry::check(const LookupInfo & linf, const AffixMgr * pmyMgr,
1066
                     ParmString word,
1067
                     CheckInfo & ci, GuessInfo * gi, bool cross) const
1068
77.7k
{
1069
77.7k
  unsigned int    cond; // condition number being examined
1070
77.7k
  unsigned              tmpl;   // length of tmpword
1071
77.7k
  WordEntry             wordinfo;     // hash entry of root word or NULL
1072
77.7k
  byte *  cp;   
1073
77.7k
  VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1);
1074
1075
  // on entry prefix is 0 length or already matches the beginning of the word.
1076
  // So if the remaining root word has positive length
1077
  // and if there are enough chars in root word and added back strip chars
1078
  // to meet the number of characters conditions, then test it
1079
1080
77.7k
  tmpl = word.size() - appndl;
1081
1082
77.7k
  if ((tmpl > 0) &&  (tmpl + stripl >= conds->num)) {
1083
1084
    // generate new root word by removing prefix and adding
1085
    // back any characters that would have been stripped
1086
1087
75.8k
    if (stripl) strcpy (tmpword, strip);
1088
75.8k
    strcpy ((tmpword + stripl), (word + appndl));
1089
1090
    // now make sure all of the conditions on characters
1091
    // are met.  Please see the appendix at the end of
1092
    // this file for more info on exactly what is being
1093
    // tested
1094
1095
75.8k
    cp = (byte *)tmpword;
1096
75.9k
    for (cond = 0;  cond < conds->num;  cond++) {
1097
2.04k
      if ((conds->get(*cp++) & (1 << cond)) == 0) break;
1098
2.04k
    }
1099
1100
    // if all conditions are met then check if resulting
1101
    // root word in the dictionary
1102
1103
75.8k
    if (cond >= conds->num) {
1104
73.8k
      CheckInfo * lci = 0;
1105
73.8k
      CheckInfo * guess = 0;
1106
73.8k
      tmpl += stripl;
1107
1108
73.8k
      int res = linf.lookup(tmpword, &linf.sp->s_cmp_end, achar, wordinfo, gi);
1109
1110
73.8k
      if (res == 1) {
1111
1112
136
        lci = &ci;
1113
136
        lci->word = wordinfo.word;
1114
136
        goto quit;
1115
        
1116
73.7k
      } else if (res == -1) {
1117
1118
691
        guess = gi->head;
1119
1120
691
      }
1121
      
1122
      // prefix matched but no root word was found 
1123
      // if XPRODUCT is allowed, try again but now 
1124
      // cross checked combined with a suffix
1125
      
1126
73.7k
      if (gi)
1127
4.46k
        lci = gi->head;
1128
      
1129
73.7k
      if (cross && xpflg & XPRODUCT) {
1130
73.7k
        if (pmyMgr->suffix_check(linf, ParmString(tmpword, tmpl), 
1131
73.7k
                                 ci, gi,
1132
73.7k
                                 XPRODUCT, (AffEntry *)this)) {
1133
0
          lci = &ci;
1134
          
1135
73.7k
        } else if (gi) {
1136
          
1137
4.46k
          CheckInfo * stop = lci;
1138
4.46k
          for (lci = gi->head; 
1139
4.54k
               lci != stop; 
1140
4.46k
               lci = const_cast<CheckInfo *>(lci->next)) 
1141
86
          {
1142
86
            lci->pre_flag = achar;
1143
86
            lci->pre_strip_len = stripl;
1144
86
            lci->pre_add_len = appndl;
1145
86
            lci->pre_add = appnd;
1146
86
          }
1147
          
1148
69.2k
        } else {
1149
          
1150
69.2k
          lci = 0;
1151
          
1152
69.2k
        }
1153
73.7k
      }
1154
    
1155
73.7k
      if (guess)
1156
691
        lci = guess;
1157
      
1158
73.8k
    quit:
1159
73.8k
      if (lci) {
1160
2.80k
        lci->pre_flag = achar;
1161
2.80k
        lci->pre_strip_len = stripl;
1162
2.80k
        lci->pre_add_len = appndl;
1163
2.80k
        lci->pre_add = appnd;
1164
2.80k
      }
1165
73.8k
      if (lci == &ci) return true;
1166
73.8k
    }
1167
75.8k
  }
1168
77.5k
  return false;
1169
77.7k
}
1170
1171
bool SfxEntry::applicable(SimpleString word) const
1172
0
{
1173
0
  int cond;
1174
  /* make sure all conditions match */
1175
0
  if ((word.size > stripl) && (word.size >= conds->num)) {
1176
0
    const byte * cp = (const byte *) (word + word.size);
1177
0
    for (cond = conds->num; --cond >=0; ) {
1178
0
      if ((conds->get(*--cp) & (1 << cond)) == 0)
1179
0
        break;
1180
0
    }
1181
0
    if (cond < 0) return true;
1182
0
  }
1183
0
  return false;
1184
0
}
1185
1186
// add suffix to this word assuming conditions hold
1187
SimpleString SfxEntry::add(SimpleString word, ObjStack & buf, 
1188
                           int limit, SimpleString orig_word) const
1189
12.3M
{
1190
12.3M
  int cond;
1191
  /* make sure all conditions match */
1192
12.3M
  if ((orig_word.size > stripl) && (orig_word.size >= conds->num)) {
1193
9.39M
    const byte * cp = (const byte *) (orig_word + orig_word.size);
1194
17.8M
    for (cond = conds->num; --cond >=0; ) {
1195
15.6M
      if ((conds->get(*--cp) & (1 << cond)) == 0)
1196
7.16M
        break;
1197
15.6M
    }
1198
9.39M
    if (cond < 0) {
1199
2.22M
      int alen = word.size - stripl;
1200
2.22M
      if (alen >= limit) return EMPTY;
1201
      /* we have a match so add suffix */
1202
2.22M
      char * newword = (char *)buf.alloc(alen + appndl + 1);
1203
2.22M
      memcpy(newword, word, alen);
1204
2.22M
      memcpy(newword + alen, appnd, appndl + 1);
1205
2.22M
      return SimpleString(newword, alen + appndl);
1206
2.22M
    }
1207
9.39M
  }
1208
10.0M
  return SimpleString();
1209
12.3M
}
1210
1211
// see if this suffix is present in the word 
1212
bool SfxEntry::check(const LookupInfo & linf, ParmString word,
1213
                     CheckInfo & ci, GuessInfo * gi,
1214
                     int optflags, AffEntry* ppfx)
1215
21.9M
{
1216
21.9M
  unsigned              tmpl;    // length of tmpword 
1217
21.9M
  int     cond;    // condition beng examined
1218
21.9M
  WordEntry             wordinfo;        // hash entry pointer
1219
21.9M
  byte *  cp;
1220
21.9M
  VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1);
1221
21.9M
  PfxEntry* ep = (PfxEntry *) ppfx;
1222
1223
  // if this suffix is being cross checked with a prefix
1224
  // but it does not support cross products skip it
1225
1226
21.9M
  if ((optflags & XPRODUCT) != 0 &&  (xpflg & XPRODUCT) == 0)
1227
98
    return false;
1228
1229
  // upon entry suffix is 0 length or already matches the end of the word.
1230
  // So if the remaining root word has positive length
1231
  // and if there are enough chars in root word and added back strip chars
1232
  // to meet the number of characters conditions, then test it
1233
1234
21.9M
  tmpl = word.size() - appndl;
1235
1236
21.9M
  if ((tmpl > 0)  &&  (tmpl + stripl >= conds->num)) {
1237
1238
    // generate new root word by removing suffix and adding
1239
    // back any characters that would have been stripped or
1240
    // or null terminating the shorter string
1241
1242
11.5M
    strcpy (tmpword, word);
1243
11.5M
    cp = (byte *)(tmpword + tmpl);
1244
11.5M
    if (stripl) {
1245
11.2M
      strcpy ((char *)cp, strip);
1246
11.2M
      tmpl += stripl;
1247
11.2M
      cp = (byte *)(tmpword + tmpl);
1248
11.2M
    } else *cp = '\0';
1249
1250
    // now make sure all of the conditions on characters
1251
    // are met.  Please see the appendix at the end of
1252
    // this file for more info on exactly what is being
1253
    // tested
1254
1255
24.8M
    for (cond = conds->num;  --cond >= 0; ) {
1256
24.6M
      if ((conds->get(*--cp) & (1 << cond)) == 0) break;
1257
24.6M
    }
1258
1259
    // if all conditions are met then check if resulting
1260
    // root word in the dictionary
1261
1262
11.5M
    if (cond < 0) {
1263
193k
      CheckInfo * lci = 0;
1264
193k
      tmpl += stripl;
1265
193k
      const SensitiveCompare * cmp = 
1266
193k
        optflags & XPRODUCT ? &linf.sp->s_cmp_middle : &linf.sp->s_cmp_begin;
1267
193k
      int res = linf.lookup(tmpword, cmp, achar, wordinfo, gi);
1268
193k
      if (res == 1
1269
193k
          && ((optflags & XPRODUCT) == 0 || TESTAFF(wordinfo.aff, ep->achar)))
1270
684
      {
1271
684
        lci = &ci;
1272
684
        lci->word = wordinfo.word;
1273
192k
      } else if (res == 1 && gi) {
1274
11
        lci = gi->add();
1275
11
        lci->word = wordinfo.word;
1276
192k
      } else if (res == -1) { // gi must be defined
1277
1.10k
        lci = gi->head;
1278
1.10k
      }
1279
1280
193k
      if (lci) {
1281
1.80k
        lci->suf_flag = achar;
1282
1.80k
        lci->suf_strip_len = stripl;
1283
1.80k
        lci->suf_add_len = appndl;
1284
1.80k
        lci->suf_add = appnd;
1285
1.80k
      }
1286
      
1287
193k
      if (lci == &ci) return true;
1288
193k
    }
1289
11.5M
  }
1290
21.9M
  return false;
1291
21.9M
}
1292
1293
//////////////////////////////////////////////////////////////////////
1294
//
1295
// new_affix_mgr
1296
//
1297
1298
1299
PosibErr<AffixMgr *> new_affix_mgr(ParmString name, 
1300
                                   Conv & iconv,
1301
                                   const Language * lang)
1302
467
{
1303
467
  if (name == "none")
1304
0
    return 0;
1305
  //CERR << "NEW AFFIX MGR\n";
1306
467
  String file;
1307
467
  file += lang->data_dir();
1308
467
  file += '/';
1309
467
  file += lang->name();
1310
467
  file += "_affix.dat";
1311
467
  AffixMgr * affix;
1312
467
  affix = new AffixMgr(lang);
1313
467
  PosibErrBase pe = affix->setup(file, iconv);
1314
467
  if (pe.has_err()) {
1315
0
    delete affix;
1316
0
    return pe;
1317
467
  } else {
1318
467
    return affix;
1319
467
  }
1320
467
}
1321
}
1322
1323
/**************************************************************************
1324
1325
Appendix:  Understanding Affix Code
1326
1327
1328
An affix is either a  prefix or a suffix attached to root words to make 
1329
other words.
1330
1331
Basically a Prefix or a Suffix is set of AffEntry objects
1332
which store information about the prefix or suffix along 
1333
with supporting routines to check if a word has a particular 
1334
prefix or suffix or a combination.
1335
1336
The structure affentry is defined as follows:
1337
1338
struct AffEntry
1339
{
1340
   unsigned char achar;   // char used to represent the affix
1341
   char * strip;          // string to strip before adding affix
1342
   char * appnd;          // the affix string to add
1343
   short  stripl;         // length of the strip string
1344
   short  appndl;         // length of the affix string
1345
   short  numconds;       // the number of conditions that must be met
1346
   short  xpflg;          // flag: XPRODUCT- combine both prefix and suffix 
1347
   char   conds[SETSIZE]; // array which encodes the conditions to be met
1348
};
1349
1350
1351
Here is a suffix borrowed from the en_US.aff file.  This file 
1352
is whitespace delimited.
1353
1354
SFX D Y 4 
1355
SFX D   0     e          d
1356
SFX D   y     ied        [^aeiou]y
1357
SFX D   0     ed         [^ey]
1358
SFX D   0     ed         [aeiou]y
1359
1360
This information can be interpreted as follows:
1361
1362
In the first line has 4 fields
1363
1364
Field
1365
-----
1366
1     SFX - indicates this is a suffix
1367
2     D   - is the name of the character flag which represents this suffix
1368
3     Y   - indicates it can be combined with prefixes (cross product)
1369
4     4   - indicates that sequence of 4 affentry structures are needed to
1370
               properly store the affix information
1371
1372
The remaining lines describe the unique information for the 4 SfxEntry 
1373
objects that make up this affix.  Each line can be interpreted
1374
as follows: (note fields 1 and 2 are as a check against line 1 info)
1375
1376
Field
1377
-----
1378
1     SFX         - indicates this is a suffix
1379
2     D           - is the name of the character flag for this affix
1380
3     y           - the string of chars to strip off before adding affix
1381
                         (a 0 here indicates the NULL string)
1382
4     ied         - the string of affix characters to add
1383
5     [^aeiou]y   - the conditions which must be met before the affix
1384
                    can be applied
1385
1386
Field 5 is interesting.  Since this is a suffix, field 5 tells us that
1387
there are 2 conditions that must be met.  The first condition is that 
1388
the next to the last character in the word must *NOT* be any of the 
1389
following "a", "e", "i", "o" or "u".  The second condition is that
1390
the last character of the word must end in "y".
1391
1392
So how can we encode this information concisely and be able to 
1393
test for both conditions in a fast manner?  The answer is found
1394
but studying the wonderful ispell code of Geoff Kuenning, et.al. 
1395
(now available under a normal BSD license).
1396
1397
If we set up a conds array of 256 bytes indexed (0 to 255) and access it
1398
using a character (cast to an unsigned char) of a string, we have 8 bits
1399
of information we can store about that character.  Specifically we
1400
could use each bit to say if that character is allowed in any of the 
1401
last (or first for prefixes) 8 characters of the word.
1402
1403
Basically, each character at one end of the word (up to the number 
1404
of conditions) is used to index into the conds array and the resulting 
1405
value found there says whether the that character is valid for a 
1406
specific character position in the word.  
1407
1408
For prefixes, it does this by setting bit 0 if that char is valid 
1409
in the first position, bit 1 if valid in the second position, and so on. 
1410
1411
If a bit is not set, then that char is not valid for that position in the
1412
word.
1413
1414
If working with suffixes bit 0 is used for the character closest 
1415
to the front, bit 1 for the next character towards the end, ..., 
1416
with bit numconds-1 representing the last char at the end of the string. 
1417
1418
Note: since entries in the conds[] are 8 bits, only 8 conditions 
1419
(read that only 8 character positions) can be examined at one
1420
end of a word (the beginning for prefixes and the end for suffixes.
1421
1422
So to make this clearer, lets encode the conds array values for the 
1423
first two affentries for the suffix D described earlier.
1424
1425
1426
  For the first affentry:    
1427
     numconds = 1             (only examine the last character)
1428
1429
     conds['e'] =  (1 << 0)   (the word must end in an E)
1430
     all others are all 0
1431
1432
  For the second affentry:
1433
     numconds = 2             (only examine the last two characters)     
1434
1435
     conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
1436
         where X is all characters *but* a, e, i, o, or u
1437
         
1438
1439
     conds['y'] = (1 << 1)     (the last char must be a y)
1440
     all other bits for all other entries in the conds array are zero
1441
1442
1443
**************************************************************************/