Coverage Report

Created: 2026-02-26 06:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aspell/modules/speller/default/affix.cpp
Line
Count
Source
1
// This file is part of The New Aspell
2
// Copyright (C) 2004 by Kevin Atkinson under the GNU LGPL
3
// license version 2.0 or 2.1.  You should have received a copy of the
4
// LGPL license along with this library if you did not you can find it
5
// at http://www.gnu.org/.
6
//
7
// This code is based on the the MySpell affix code:
8
//
9
/*
10
 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada And
11
 * Contributors.  All rights reserved.
12
 *
13
 * Redistribution and use in source and binary forms, with or without
14
 * modification, are permitted provided that the following conditions
15
 * are met:
16
 *
17
 * 1. Redistributions of source code must retain the above copyright
18
 *    notice, this list of conditions and the following disclaimer.
19
 *
20
 * 2. Redistributions in binary form must reproduce the above copyright
21
 *    notice, this list of conditions and the following disclaimer in the
22
 *    documentation and/or other materials provided with the distribution.
23
 *
24
 * 3. All modifications to the source code must be clearly marked as
25
 *    such.  Binary redistributions based on modified source code
26
 *    must be clearly marked as modified versions in the documentation
27
 *    and/or other materials provided with the distribution.
28
 *
29
 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
30
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
31
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
32
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
33
 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
34
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
35
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
36
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40
 * SUCH DAMAGE.
41
 *
42
 */
43
44
#include <cstdlib>
45
#include <cstring>
46
#include <cstdio>
47
48
//#include "iostream.hpp"
49
50
#include "affix.hpp"
51
#include "errors.hpp"
52
#include "getdata.hpp"
53
#include "parm_string.hpp"
54
#include "check_list.hpp"
55
#include "speller_impl.hpp"
56
#include "vararray.hpp"
57
#include "lsort.hpp"
58
#include "hash-t.hpp"
59
60
#include "gettext.h"
61
62
using namespace std;
63
64
namespace aspeller {
65
66
typedef unsigned char byte;
67
static char EMPTY[1] = {0};
68
69
//////////////////////////////////////////////////////////////////////
70
//
71
// Entry struct definations
72
//
73
74
struct Conds
75
{
76
  char * str;
77
  unsigned num;
78
  char conds[SETSIZE];
79
18.3M
  char get(byte i) const {return conds[i];}
80
};
81
82
struct AffEntry
83
{
84
  const char *   appnd;
85
  const char *   strip;
86
  byte           appndl;
87
  byte           stripl;
88
  byte           xpflg;
89
  char           achar;
90
  const Conds *  conds;
91
  //unsigned int numconds;
92
  //char         conds[SETSIZE];
93
};
94
95
// A Prefix Entry
96
  
97
struct PfxEntry : public AffEntry
98
{
99
  PfxEntry * next;
100
  PfxEntry * next_eq;
101
  PfxEntry * next_ne;
102
  PfxEntry * flag_next;
103
9.00k
  PfxEntry() {}
104
105
  bool check(const LookupInfo &, const AffixMgr * pmyMgr,
106
             ParmString, CheckInfo &, GuessInfo *, bool cross = true) const;
107
108
99.2k
  inline bool          allow_cross() const { return ((xpflg & XPRODUCT) != 0); }
109
9.00k
  inline byte flag() const { return achar;  }
110
159k
  inline const char *  key() const  { return appnd;  }
111
  bool applicable(SimpleString) const;
112
  SimpleString add(SimpleString, ObjStack & buf) const;
113
};
114
115
// A Suffix Entry
116
117
struct SfxEntry : public AffEntry
118
{
119
  const char * rappnd; // this is set in AffixMgr::build_sfxlist
120
  
121
  SfxEntry *   next;
122
  SfxEntry *   next_eq;
123
  SfxEntry *   next_ne;
124
  SfxEntry *   flag_next;
125
126
555k
  SfxEntry() {}
127
128
  bool check(const LookupInfo &, ParmString, CheckInfo &, GuessInfo *,
129
             int optflags, AffEntry * ppfx);
130
131
465k
  inline bool          allow_cross() const { return ((xpflg & XPRODUCT) != 0); }
132
555k
  inline byte flag() const { return achar;  }
133
378M
  inline const char *  key() const  { return rappnd; } 
134
  bool applicable(SimpleString) const;
135
  SimpleString add(SimpleString, ObjStack & buf, int limit, SimpleString) const;
136
};
137
138
//////////////////////////////////////////////////////////////////////
139
//
140
// Utility functions declarations
141
//
142
143
/* return 1 if s1 is subset of s2 */
144
static bool isSubset(const char * s1, const char * s2)
145
182M
{
146
766M
  while( *s1 && (*s1 == *s2) ) {
147
584M
    s1++;
148
584M
    s2++;
149
584M
  }
150
182M
  return (*s1 == '\0');
151
182M
}
152
153
// return 1 if s1 (reversed) is a leading subset of end of s2
154
static bool isRevSubset(const char * s1, const char * end_of_s2, int len)
155
1.64M
{
156
3.44M
  while( (len > 0) && *s1 && (*s1 == *end_of_s2) ) {
157
1.79M
    s1++;
158
1.79M
    end_of_s2--;
159
1.79M
    len --;
160
1.79M
  }
161
1.64M
  return (*s1 == '\0');
162
1.64M
}
163
164
template <class T>
165
struct AffixLess
166
{
167
5.80M
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
aspeller::AffixLess<aspeller::PfxEntry>::operator()(aspeller::PfxEntry*, aspeller::PfxEntry*) const
Line
Count
Source
167
6.13k
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
aspeller::AffixLess<aspeller::SfxEntry>::operator()(aspeller::SfxEntry*, aspeller::SfxEntry*) const
Line
Count
Source
167
5.80M
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
168
};
169
170
// struct StringLookup {
171
//   struct Parms {
172
//     typedef const char * Value;
173
//     typedef const char * Key;
174
//     static const bool is_multi = false;
175
//     hash<const char *> hfun;
176
//     size_t hash(const char * s) {return hfun(s);}
177
//     bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;}
178
//     const char * key(const char * c) {return c;}
179
//   };
180
//   typedef HashTable<Parms> Lookup;
181
//   Lookup lookup;
182
//   ObjStack * data_buf;
183
//   StringLookup(ObjStack * b) : data_buf(b) {}
184
//   const char * dup(const char * orig) {
185
//     pair<Lookup::iterator, bool> res = lookup.insert(orig);
186
//     if (res.second) *res.first = data_buf->dup(orig);
187
//     return *res.first;
188
//     //return data_buf->dup(orig);
189
//   }
190
// };
191
192
struct CondsLookupParms {
193
  typedef const Conds * Value;
194
  typedef const char * Key;
195
  static const bool is_multi = false;
196
  acommon::hash<const char *> hfun;
197
589k
  size_t hash(const char * s) {return hfun(s);}
198
856k
  bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;}
199
882k
  const char * key(const Conds * c) {return c->str;}
200
};
201
202
typedef HashTable<CondsLookupParms> CondsLookup;
203
204
// normalizes and checks the cond_str
205
// returns the length of the new string or -1 if invalid
206
static int normalize_cond_str(char * str)
207
564k
{
208
564k
  char * s = str;
209
564k
  char * d = str;
210
2.83M
  while (*s) {
211
2.26M
    if (*s != '[') {
212
2.18M
      *d++ = *s++;
213
2.18M
    } else if (s[1] == '\0' || s[1] == ']') {
214
0
      return -1;
215
79.7k
    } else if (s[2] == ']') {
216
0
      *d++ = s[1];
217
0
      s += 3;
218
79.7k
    } else {
219
79.7k
      *d++ = *s++;
220
79.7k
      if (*s == '^') *d++ = *s++;
221
311k
      while (*s != ']') {
222
231k
        if (*s == '\0' || *s == '[') return -1;
223
231k
        char * min = s;
224
521k
        for (char * i = s + 1; *i != ']'; ++i) {
225
290k
          if ((byte)*i < (byte)*min) min = i;}
226
231k
        char c = *s;
227
231k
        *d++ = *min;
228
231k
        *min = c;
229
231k
        ++s;
230
231k
      }
231
79.7k
      *d++ = *s++;
232
79.7k
    }
233
2.26M
  }
234
564k
  *d = '\0';
235
564k
  return d - str;
236
564k
}
237
238
static void encodeit(CondsLookup &, ObjStack &, 
239
                     AffEntry * ptr, char * cs);
240
241
//////////////////////////////////////////////////////////////////////
242
//
243
// Affix Manager
244
//
245
246
PosibErr<void> AffixMgr::setup(ParmString affpath, Conv & iconv)
247
975
{
248
  // register hash manager and load affix data from aff file
249
  //cpdmin = 3;  // default value
250
975
  max_strip_ = 0;
251
250k
  for (int i=0; i < SETSIZE; i++) {
252
249k
    pStart[i] = NULL;
253
249k
    sStart[i] = NULL;
254
249k
    pFlag[i] = NULL;
255
249k
    sFlag[i] = NULL;
256
249k
    max_strip_f[i] = 0;
257
249k
  }
258
975
  return parse_file(affpath, iconv);
259
975
}
260
261
AffixMgr::AffixMgr(const Language * l) 
262
975
  : lang(l), data_buf(1024*16) {}
263
264
975
AffixMgr::~AffixMgr() {}
265
266
static inline void max_(int & lhs, int rhs) 
267
751k
{
268
751k
  if (lhs < rhs) lhs = rhs;
269
751k
}
270
271
// read in aff file and build up prefix and suffix entry objects 
272
PosibErr<void> AffixMgr::parse_file(const char * affpath, Conv & iconv)
273
975
{
274
  // io buffers
275
975
  String buf; DataPair dp;
276
277
975
  CondsLookup conds_lookup;
278
 
279
  // open the affix file
280
975
  affix_file = data_buf.dup(affpath);
281
975
  FStream afflst;
282
975
  RET_ON_ERR(afflst.open(affpath,"r"));
283
284
  // step one is to parse the affix file building up the internal
285
  // affix data structures
286
287
  // read in each line ignoring any that do not
288
  // start with a known line type indicator
289
290
975
  char prev_aff = '\0';
291
292
134k
  while (getdata_pair(afflst,dp,buf)) {
293
133k
    char affix_type = ' ';
294
295
    /* parse in the name of the character set used by the .dict and .aff */
296
297
133k
    if (dp.key == "SET") {
298
975
      String buf;
299
975
      encoding = data_buf.dup(fix_encoding_str(dp.value, buf));
300
975
      if (strcmp(encoding, lang->data_encoding()) != 0)
301
0
        return make_err(incorrect_encoding, affix_file, lang->data_encoding(), encoding);
302
975
    }
303
304
    /* parse in the flag used by the controlled compound words */
305
    //else if (d.key == "COMPOUNDFLAG")
306
    //  compound = data_buf.dup(d.value);
307
308
    /* parse in the flag used by the controlled compound words */
309
    //else if (d.key == "COMPOUNDMIN")
310
    //  cpdmin = atoi(d.value); // FiXME
311
312
    //else if (dp.key == "TRY" || dp.key == "REP");
313
314
132k
    else if (dp.key == "PFX" || dp.key == "SFX")
315
24.0k
      affix_type = dp.key[0];
316
317
133k
    if (affix_type == ' ') continue;
318
319
    //
320
    // parse this affix: P - prefix, S - suffix
321
    //
322
323
24.0k
    int numents = 0;      // number of affentry structures to parse
324
24.0k
    char achar='\0';      // affix char identifier
325
24.0k
    short xpflg=0;
326
24.0k
    AffEntry * nptr;
327
24.0k
    {
328
      // split affix header line into pieces
329
24.0k
      split(dp);
330
24.0k
      if (dp.key.empty()) goto error;
331
      // key is affix char
332
24.0k
      const char * astr = iconv(dp.key);
333
24.0k
      if (astr[0] == '\0' || astr[1] != '\0') goto error;
334
24.0k
      achar = astr[0];
335
24.0k
      if (achar == prev_aff) goto error_count;
336
24.0k
      prev_aff = achar;
337
338
24.0k
      split(dp);
339
24.0k
      if (dp.key.size != 1 || 
340
24.0k
          !(dp.key[0] == 'Y' || dp.key[0] == 'N')) goto error;
341
      // key is cross product indicator 
342
24.0k
      if (dp.key[0] == 'Y') xpflg = XPRODUCT;
343
    
344
24.0k
      split(dp);
345
24.0k
      if (dp.key.empty()) goto error;
346
      // key is number of affentries
347
      
348
24.0k
      numents = atoi(dp.key); 
349
  
350
588k
      for (int j = 0; j < numents; j++) {
351
564k
        getdata_pair(afflst, dp, buf);
352
353
564k
        if (affix_type == 'P') {
354
9.00k
          nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(PfxEntry));
355
9.00k
          new (nptr) PfxEntry;
356
555k
        } else {
357
555k
          nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(SfxEntry));
358
555k
          new (nptr) SfxEntry;
359
555k
        }
360
361
564k
        nptr->xpflg = xpflg;
362
363
564k
        split(dp);
364
564k
        if (dp.key.empty()) goto error;
365
        // key is affix charter
366
564k
        if (iconv(dp.key)[0] != achar) goto error_count;
367
564k
        nptr->achar = achar;
368
 
369
564k
        split(dp);
370
564k
        if (dp.key.empty()) goto error;
371
        // key is strip 
372
564k
        if (dp.key != "0") {
373
375k
          ParmString s0(iconv(dp.key));
374
375k
          max_(max_strip_, s0.size());
375
375k
          max_(max_strip_f[(byte)achar], s0.size());
376
375k
          nptr->strip = data_buf.dup(s0);
377
375k
          nptr->stripl = s0.size();
378
375k
        } else {
379
188k
          nptr->strip  = "";
380
188k
          nptr->stripl = 0;
381
188k
        }
382
    
383
564k
        split(dp);
384
564k
        if (dp.key.empty()) goto error;
385
        // key is affix string or 0 for null
386
564k
        if (dp.key != "0") {
387
563k
          nptr->appnd  = data_buf.dup(iconv(dp.key));
388
563k
          nptr->appndl = strlen(nptr->appnd);
389
563k
        } else {
390
1.16k
          nptr->appnd  = "";
391
1.16k
          nptr->appndl = 0;
392
1.16k
        }
393
    
394
564k
        split(dp);
395
564k
        if (dp.key.empty()) goto error;
396
        // key is the conditions descriptions
397
564k
        char * cond = iconv(dp.key);
398
564k
        int cond_len = normalize_cond_str(cond);
399
564k
        if (cond_len < 0)
400
0
          return (make_err(invalid_cond, MsgConv(lang)(cond))
401
0
                  .with_file(affix_file, dp.line_num));
402
564k
        if (nptr->stripl != 0) {
403
375k
          char * cc = cond;
404
375k
          if (affix_type == 'S') cc += cond_len - nptr->stripl;
405
375k
          if (cond_len < nptr->stripl || 
406
375k
              memcmp(cc, nptr->strip, nptr->stripl) != 0)
407
0
            return (make_err(invalid_cond_strip, 
408
0
                             MsgConv(lang)(cond), MsgConv(lang)(nptr->strip))
409
0
                    .with_file(affix_file, dp.line_num));
410
375k
        }
411
564k
        encodeit(conds_lookup, data_buf, nptr, cond);
412
    
413
        // now create SfxEntry or PfxEntry objects and use links to
414
        // build an ordered (sorted by affix string) list
415
564k
        if (affix_type == 'P')
416
9.00k
          build_pfxlist(static_cast<PfxEntry *>(nptr));
417
555k
        else
418
555k
          build_sfxlist(static_cast<SfxEntry *>(nptr)); 
419
564k
      }
420
24.0k
    }
421
24.0k
    continue;
422
24.0k
  error:
423
0
    return make_err(corrupt_affix, MsgConv(lang)(achar)).with_file(affix_file, dp.line_num);
424
0
  error_count:
425
0
    return make_err(corrupt_affix, MsgConv(lang)(achar), 
426
0
                    _("Possibly incorrect count.")).with_file(affix_file, dp.line_num);
427
24.0k
  }
428
975
  afflst.close();
429
430
  // now we can speed up performance greatly taking advantage of the 
431
  // relationship between the affixes and the idea of "subsets".
432
433
  // View each prefix as a potential leading subset of another and view
434
  // each suffix (reversed) as a potential trailing subset of another.
435
436
  // To illustrate this relationship if we know the prefix "ab" is
437
  // found in the word to examine, only prefixes that "ab" is a
438
  // leading subset of need be examined.  Furthermore is "ab" is not
439
  // present then none of the prefixes that "ab" is is a subset need
440
  // be examined.
441
442
  // The same argument goes for suffix string that are reversed.
443
444
  // Then to top this off why not examine the first char of the word
445
  // to quickly limit the set of prefixes to examine (i.e. the
446
  // prefixes to examine must be leading supersets of the first
447
  // character of the word (if they exist)
448
 
449
  // To take advantage of this "subset" relationship, we need to add
450
  // two links from entry.  One to take next if the current prefix
451
  // is found (call it nexteq) and one to take next if the current
452
  // prefix is not found (call it nextne).
453
454
  // Since we have built ordered lists, all that remains is to
455
  // properly initialize the nextne and nexteq pointers that relate
456
  // them
457
458
975
  process_pfx_order();
459
975
  process_sfx_order();
460
461
  //CERR.printf("%u\n", data_buf.calc_size()/1024);
462
463
975
  return no_err;
464
465
975
}
466
467
468
// we want to be able to quickly access prefix information
469
// both by prefix flag, and sorted by prefix string itself
470
// so we need to set up two indexes
471
472
PosibErr<void> AffixMgr::build_pfxlist(PfxEntry* pfxptr)
473
9.00k
{
474
9.00k
  PfxEntry * ptr;
475
9.00k
  PfxEntry * ep = pfxptr;
476
477
  // get the right starting point 
478
9.00k
  const char * key = ep->key();
479
9.00k
  const byte flg = ep->flag();
480
481
  // first index by flag which must exist
482
9.00k
  ptr = pFlag[flg];
483
9.00k
  ep->flag_next = ptr;
484
9.00k
  pFlag[flg] = ep;
485
486
  // next insert the affix string, it will be sorted latter
487
488
9.00k
  byte sp = *((const byte *)key);
489
9.00k
  ptr = pStart[sp];
490
9.00k
  ep->next = ptr;
491
9.00k
  pStart[sp] = ep;
492
9.00k
  return no_err;
493
9.00k
}
494
495
// we want to be able to quickly access suffix information
496
// both by suffix flag, and sorted by the reverse of the
497
// suffix string itself; so we need to set up two indexes
498
499
PosibErr<void> AffixMgr::build_sfxlist(SfxEntry* sfxptr)
500
555k
{
501
555k
  SfxEntry * ptr;
502
555k
  SfxEntry * ep = sfxptr;
503
555k
  char * tmp = (char *)data_buf.alloc(sfxptr->appndl + 1);
504
555k
  sfxptr->rappnd = tmp;
505
506
  // reverse the string
507
555k
  char * dest = tmp + sfxptr->appndl;
508
555k
  *dest-- = 0;
509
555k
  const char * src = sfxptr->appnd;
510
3.73M
  for (; dest >= tmp; --dest, ++src)
511
3.18M
    *dest = *src;
512
513
  /* get the right starting point */
514
555k
  const char * key = ep->key();
515
555k
  const byte flg = ep->flag();
516
517
  // first index by flag which must exist
518
555k
  ptr = sFlag[flg];
519
555k
  ep->flag_next = ptr;
520
555k
  sFlag[flg] = ep;
521
522
  // next insert the affix string, it will be sorted latter
523
    
524
555k
  byte sp = *((const byte *)key);
525
555k
  ptr = sStart[sp];
526
555k
  ep->next = ptr;
527
555k
  sStart[sp] = ep;
528
555k
  return no_err;
529
555k
}
530
531
532
533
// initialize the PfxEntry links NextEQ and NextNE to speed searching
534
PosibErr<void> AffixMgr::process_pfx_order()
535
975
{
536
975
  PfxEntry* ptr;
537
538
  // loop through each prefix list starting point
539
249k
  for (int i=1; i < SETSIZE; i++) {
540
541
248k
    ptr = pStart[i];
542
543
248k
    if (ptr && ptr->next)
544
1.19k
      ptr = pStart[i] = sort(ptr, AffixLess<PfxEntry>());
545
546
    // look through the remainder of the list
547
    //  and find next entry with affix that 
548
    // the current one is not a subset of
549
    // mark that as destination for NextNE
550
    // use next in list that you are a subset
551
    // of as NextEQ
552
553
257k
    for (; ptr != NULL; ptr = ptr->next) {
554
555
9.00k
      PfxEntry * nptr = ptr->next;
556
11.5k
      for (; nptr != NULL; nptr = nptr->next) {
557
5.25k
        if (! isSubset( ptr->key() , nptr->key() )) break;
558
5.25k
      }
559
9.00k
      ptr->next_ne = nptr;
560
9.00k
      ptr->next_eq = NULL;
561
9.00k
      if ((ptr->next) && isSubset(ptr->key() , 
562
2.97k
                                  (ptr->next)->key())) 
563
620
        ptr->next_eq = ptr->next;
564
9.00k
    }
565
566
    // now clean up by adding smart search termination strings
567
    // if you are already a superset of the previous prefix
568
    // but not a subset of the next, search can end here
569
    // so set NextNE properly
570
571
248k
    ptr = pStart[i];
572
257k
    for (; ptr != NULL; ptr = ptr->next) {
573
9.00k
      PfxEntry * nptr = ptr->next;
574
9.00k
      PfxEntry * mptr = NULL;
575
11.5k
      for (; nptr != NULL; nptr = nptr->next) {
576
5.25k
        if (! isSubset(ptr->key(),nptr->key())) break;
577
2.54k
        mptr = nptr;
578
2.54k
      }
579
9.00k
      if (mptr) mptr->next_ne = NULL;
580
9.00k
    }
581
248k
  }
582
975
  return no_err;
583
975
}
584
585
586
587
// initialize the SfxEntry links NextEQ and NextNE to speed searching
588
PosibErr<void> AffixMgr::process_sfx_order()
589
975
{
590
975
  SfxEntry* ptr;
591
592
  // loop through each prefix list starting point
593
249k
  for (int i=1; i < SETSIZE; i++) {
594
595
248k
    ptr = sStart[i];
596
597
248k
    if (ptr && ptr->next)
598
8.18k
      ptr = sStart[i] = sort(ptr, AffixLess<SfxEntry>());
599
600
    // look through the remainder of the list
601
    //  and find next entry with affix that 
602
    // the current one is not a subset of
603
    // mark that as destination for NextNE
604
    // use next in list that you are a subset
605
    // of as NextEQ
606
607
802k
    for (; ptr != NULL; ptr = ptr->next) {
608
554k
      SfxEntry * nptr = ptr->next;
609
90.8M
      for (; nptr != NULL; nptr = nptr->next) {
610
90.8M
        if (! isSubset(ptr->key(),nptr->key())) break;
611
90.8M
      }
612
554k
      ptr->next_ne = nptr;
613
554k
      ptr->next_eq = NULL;
614
554k
      if ((ptr->next) && isSubset(ptr->key(),(ptr->next)->key())) 
615
447k
        ptr->next_eq = ptr->next;
616
554k
    }
617
618
619
    // now clean up by adding smart search termination strings:
620
    // if you are already a superset of the previous suffix
621
    // but not a subset of the next, search can end here
622
    // so set NextNE properly
623
624
248k
    ptr = sStart[i];
625
802k
    for (; ptr != NULL; ptr = ptr->next) {
626
554k
      SfxEntry * nptr = ptr->next;
627
554k
      SfxEntry * mptr = NULL;
628
90.8M
      for (; nptr != NULL; nptr = nptr->next) {
629
90.8M
        if (! isSubset(ptr->key(),nptr->key())) break;
630
90.3M
        mptr = nptr;
631
90.3M
      }
632
554k
      if (mptr) mptr->next_ne = NULL;
633
554k
    }
634
248k
  }
635
975
  return no_err;
636
975
}
637
638
// takes aff file condition string and creates the
639
// conds array - please see the appendix at the end of the
640
// file affentry.cxx which describes what is going on here
641
// in much more detail
642
643
static void encodeit(CondsLookup & l, ObjStack & buf, 
644
                     AffEntry * ptr, char * cs)
645
564k
{
646
564k
  byte c;
647
564k
  int i, j, k;
648
649
  // see if we already have this conds matrix
650
651
564k
  CondsLookup::iterator itr = l.find(cs);
652
564k
  if (!(itr == l.end())) {
653
545k
    ptr->conds = *itr;
654
545k
    return;
655
545k
  }
656
657
18.6k
  Conds * cds = (Conds *)buf.alloc_bottom(sizeof(Conds));
658
18.6k
  cds->str = buf.dup(cs);
659
18.6k
  l.insert(cds);
660
18.6k
  ptr->conds = cds;
661
662
18.6k
  int nc = strlen(cs);
663
18.6k
  VARARRAYM(byte, mbr, nc + 1, MAXLNLEN);
664
665
  // now clear the conditions array
666
18.6k
  memset(cds->conds, 0, sizeof(cds->conds));
667
668
  // now parse the string to create the conds array
669
  
670
18.6k
  int neg = 0;   // complement indicator
671
18.6k
  int grp = 0;   // group indicator
672
18.6k
  int n = 0;     // number of conditions
673
18.6k
  int ec = 0;    // end condition indicator
674
18.6k
  int nm = 0;    // number of member in group
675
676
  // if no condition just return
677
18.6k
  if (strcmp(cs,".")==0) {
678
975
    cds->num = 0;
679
975
    return;
680
975
  }
681
682
17.6k
  i = 0;
683
108k
  while (i < nc) {
684
90.4k
    c = *((byte *)(cs + i));
685
686
    // start group indicator
687
90.4k
    if (c == '[') {
688
10.0k
      grp = 1;
689
10.0k
      c = 0;
690
10.0k
    }
691
692
    // complement flag
693
90.4k
    if ((grp == 1) && (c == '^')) {
694
7.44k
      neg = 1;
695
7.44k
      c = 0;
696
7.44k
    }
697
698
    // end goup indicator
699
90.4k
    if (c == ']') {
700
10.0k
      ec = 1;
701
10.0k
      c = 0;
702
10.0k
    }
703
704
    // add character of group to list
705
90.4k
    if ((grp == 1) && (c != 0)) {
706
35.8k
      *(mbr + nm) = c;
707
35.8k
      nm++;
708
35.8k
      c = 0;
709
35.8k
    }
710
711
    // end of condition 
712
90.4k
    if (c != 0) {
713
26.9k
      ec = 1;
714
26.9k
    }
715
716
    
717
90.4k
    if (ec) {
718
37.0k
      if (grp == 1) {
719
10.0k
        if (neg == 0) {
720
          // set the proper bits in the condition array vals for those chars
721
13.9k
          for (j=0;j<nm;j++) {
722
11.2k
            k = (unsigned int) mbr[j];
723
11.2k
            cds->conds[k] = cds->conds[k] | (1 << n);
724
11.2k
          }
725
7.44k
        } else {
726
          // complement so set all of them and then unset indicated ones
727
1.91M
          for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n);
728
32.0k
          for (j=0;j<nm;j++) {
729
24.6k
            k = (unsigned int) mbr[j];
730
24.6k
            cds->conds[k] = cds->conds[k] & ~(1 << n);
731
24.6k
          }
732
7.44k
        }
733
10.0k
        neg = 0;
734
10.0k
        grp = 0;   
735
10.0k
        nm = 0;
736
26.9k
      } else {
737
        // not a group so just set the proper bit for this char
738
        // but first handle special case of . inside condition
739
26.9k
        if (c == '.') {
740
          // wild card character so set them all
741
0
          for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n);
742
26.9k
        } else {  
743
26.9k
          cds->conds[(unsigned int)c] = cds->conds[(unsigned int)c] | (1 << n);
744
26.9k
        }
745
26.9k
      }
746
37.0k
      n++;
747
37.0k
      ec = 0;
748
37.0k
    }
749
750
751
90.4k
    i++;
752
90.4k
  }
753
17.6k
  cds->num = n;
754
17.6k
  return;
755
18.6k
}
756
757
758
// check word for prefixes
759
bool AffixMgr::prefix_check (const LookupInfo & linf, ParmString word, 
760
                             CheckInfo & ci, GuessInfo * gi, bool cross) const
761
787k
{
762
787k
  if (word.empty()) return false;
763
 
764
  // first handle the special case of 0 length prefixes
765
787k
  PfxEntry * pe = pStart[0];
766
787k
  while (pe) {
767
0
    if (pe->check(linf,this,word,ci,gi)) return true;
768
0
    pe = pe->next;
769
0
  }
770
  
771
  // now handle the general case
772
787k
  byte sp = *reinterpret_cast<const byte *>(word.str());
773
787k
  PfxEntry * pptr = pStart[sp];
774
775
899k
  while (pptr) {
776
111k
    if (isSubset(pptr->key(),word)) {
777
6.99k
      if (pptr->check(linf,this,word,ci,gi,cross)) return true;
778
6.99k
      pptr = pptr->next_eq;
779
104k
    } else {
780
104k
      pptr = pptr->next_ne;
781
104k
    }
782
111k
  }
783
    
784
787k
  return false;
785
787k
}
786
787
788
// check word for suffixes
789
bool AffixMgr::suffix_check (const LookupInfo & linf, ParmString word, 
790
                             CheckInfo & ci, GuessInfo * gi,
791
                             int sfxopts, AffEntry * ppfx) const
792
797k
{
793
797k
  if (word.empty()) return false;
794
795
  // first handle the special case of 0 length suffixes
796
797k
  SfxEntry * se = sStart[0];
797
7.57M
  while (se) {
798
6.78M
    if (se->check(linf, word, ci, gi, sfxopts, ppfx)) return true;
799
6.78M
    se = se->next;
800
6.78M
  }
801
  
802
  // now handle the general case
803
796k
  byte sp = *((const byte *)(word + word.size() - 1));
804
796k
  SfxEntry * sptr = sStart[sp];
805
806
2.44M
  while (sptr) {
807
1.64M
    if (isRevSubset(sptr->key(), word + word.size() - 1, word.size())) {
808
876k
      if (sptr->check(linf, word, ci, gi, sfxopts, ppfx)) return true;
809
876k
      sptr = sptr->next_eq;
810
876k
    } else {
811
772k
      sptr = sptr->next_ne;
812
772k
    }
813
1.64M
  }
814
    
815
796k
  return false;
816
796k
}
817
818
// check if word with affixes is correctly spelled
819
bool AffixMgr::affix_check(const LookupInfo & linf, ParmString word, 
820
                           CheckInfo & ci, GuessInfo * gi) const
821
787k
{
822
787k
  if (word.empty()) return false;
823
824
  // Deal With Case in a semi-intelligent manner
825
786k
  CasePattern cp = lang->LangImpl::case_pattern(word);
826
786k
  ParmString pword = word;
827
786k
  ParmString sword = word;
828
786k
  CharVector lower;
829
786k
  if (cp == FirstUpper) {
830
5.91k
    lower.append(word, word.size() + 1);
831
5.91k
    lower[0] = lang->to_lower(word[0]);
832
5.91k
    pword = ParmString(lower.data(), lower.size() - 1);
833
780k
  } else if (cp == AllUpper) {
834
49.1k
    lower.resize(word.size() + 1);
835
49.1k
    unsigned int i = 0;
836
831k
    for (; i != word.size(); ++i)
837
782k
      lower[i] = lang->to_lower(word[i]);
838
49.1k
    lower[i] = '\0';
839
49.1k
    pword = ParmString(lower.data(), lower.size() - 1);
840
49.1k
    sword = pword;
841
49.1k
  }
842
843
  // check all prefixes (also crossed with suffixes if allowed) 
844
786k
  if (prefix_check(linf, pword, ci, gi)) return true;
845
846
  // if still not found check all suffixes
847
786k
  if (suffix_check(linf, sword, ci, gi, 0, NULL)) return true;
848
849
  // if still not found check again but with the lower case version
850
  // which can make a difference if the entire word matches the cond
851
  // string
852
786k
  if (cp == FirstUpper) {
853
5.91k
    return suffix_check(linf, pword, ci, gi, 0, NULL);
854
780k
  } else {
855
780k
    return false;
856
780k
  }
857
786k
}
858
859
void AffixMgr::munch(ParmString word, GuessInfo * gi, bool cross) const
860
961
{
861
961
  LookupInfo li(0, LookupInfo::AlwaysTrue);
862
961
  CheckInfo ci;
863
961
  gi->reset();
864
961
  CasePattern cp = lang->LangImpl::case_pattern(word);
865
961
  if (cp == AllUpper) return;
866
936
  if (cp != FirstUpper)
867
934
    prefix_check(li, word, ci, gi, cross);
868
936
  suffix_check(li, word, ci, gi, 0, NULL);
869
936
}
870
871
WordAff * AffixMgr::expand(ParmString word, ParmString aff, 
872
                           ObjStack & buf, int limit) const
873
400k
{
874
400k
  byte * empty = (byte *)buf.alloc(1);
875
400k
  *empty = 0;
876
877
400k
  byte * suf  = (byte *)buf.alloc(aff.size() + 1); 
878
400k
  byte * suf_e = suf;
879
400k
  byte * csuf = (byte *)buf.alloc(aff.size() + 1); 
880
400k
  byte * csuf_e = csuf;
881
882
400k
  WordAff * head = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
883
400k
  WordAff * cur = head;
884
400k
  cur->word = buf.dup(word);
885
400k
  cur->aff  = suf;
886
887
400k
  for (const byte * c = (const byte *)aff.str(), * end = c + aff.size();
888
964k
       c != end; 
889
564k
       ++c) 
890
564k
  {
891
564k
    if (sFlag[*c]) *suf_e++ = *c; 
892
564k
    if (sFlag[*c] && sFlag[*c]->allow_cross()) *csuf_e++ = *c;
893
    
894
793k
    for (PfxEntry * p = pFlag[*c]; p; p = p->flag_next) {
895
228k
      SimpleString newword = p->add(word, buf);
896
228k
      if (!newword) continue;
897
99.2k
      cur->next = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
898
99.2k
      cur = cur->next;
899
99.2k
      cur->word = newword;
900
99.2k
      cur->aff = p->allow_cross() ? csuf : empty;
901
99.2k
    }
902
564k
  }
903
904
400k
  *suf_e = 0;
905
400k
  *csuf_e = 0;
906
400k
  cur->next = 0;
907
908
400k
  if (limit == 0) return head;
909
910
400k
  WordAff * * end = &cur->next;
911
400k
  WordAff * * very_end = end;
912
400k
  size_t nsuf_s = suf_e - suf + 1;
913
914
899k
  for (WordAff * * cur = &head; cur != end; cur = &(*cur)->next) {
915
499k
    if ((int)(*cur)->word.size - max_strip_ >= limit) continue;
916
499k
    byte * nsuf = (byte *)buf.alloc(nsuf_s);
917
499k
    expand_suffix((*cur)->word, (*cur)->aff, buf, limit, nsuf, &very_end, word);
918
499k
    (*cur)->aff = nsuf;
919
499k
  }
920
921
400k
  return head;
922
400k
}
923
924
WordAff * AffixMgr::expand_suffix(ParmString word, const byte * aff, 
925
                                  ObjStack & buf, int limit,
926
                                  byte * new_aff, WordAff * * * l,
927
                                  ParmString orig_word) const
928
499k
{
929
499k
  WordAff * head = 0;
930
499k
  if (l) head = **l;
931
499k
  WordAff * * cur = l ? *l : &head;
932
499k
  bool expanded     = false;
933
499k
  bool not_expanded = false;
934
499k
  if (!orig_word) orig_word = word;
935
936
1.08M
  while (*aff) {
937
581k
    if ((int)word.size() - max_strip_f[*aff] < limit) {
938
12.2M
      for (SfxEntry * p = sFlag[*aff]; p; p = p->flag_next) {
939
11.6M
        SimpleString newword = p->add(word, buf, limit, orig_word);
940
11.6M
        if (!newword) continue;
941
2.22M
        if (newword == EMPTY) {not_expanded = true; continue;}
942
2.22M
        *cur = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
943
2.22M
        (*cur)->word = newword;
944
2.22M
        (*cur)->aff  = (const byte *)EMPTY;
945
2.22M
        cur = &(*cur)->next;
946
2.22M
        expanded = true;
947
2.22M
      }
948
581k
    }
949
581k
    if (new_aff && (!expanded || not_expanded)) *new_aff++ = *aff;
950
581k
    ++aff;
951
581k
  }
952
499k
  *cur = 0;
953
499k
  if (new_aff) *new_aff = 0;
954
499k
  if (l) *l = cur;
955
499k
  return head;
956
499k
}
957
958
CheckAffixRes AffixMgr::check_affix(ParmString word, char aff) const
959
0
{
960
0
  CheckAffixRes res = InvalidAffix;
961
  
962
0
  for (PfxEntry * p = pFlag[(unsigned char)aff]; p; p = p->flag_next) {
963
0
    res = InapplicableAffix;
964
0
    if (p->applicable(word)) return ValidAffix;
965
0
  }
966
967
0
  for (SfxEntry * p = sFlag[(unsigned char)aff]; p; p = p->flag_next) {
968
0
    if (res == InvalidAffix) res = InapplicableAffix;
969
0
    if (p->applicable(word)) return ValidAffix;
970
0
  }
971
972
0
  return res;
973
0
}
974
975
976
977
//////////////////////////////////////////////////////////////////////
978
//
979
// LookupInfo
980
//
981
982
int LookupInfo::lookup (ParmString word, const SensitiveCompare * c, 
983
                        char achar, 
984
                        WordEntry & o, GuessInfo * gi) const
985
75.2k
{
986
75.2k
  SpellerImpl::WS::const_iterator i = begin;
987
75.2k
  const char * g = 0;
988
75.2k
  if (mode == Word) {
989
54.5k
    do {
990
54.5k
      (*i)->lookup(word, c, o);
991
55.6k
      for (;!o.at_end(); o.adv()) {
992
1.12k
        if (TESTAFF(o.aff, achar))
993
3
          return 1;
994
1.11k
        else
995
1.11k
          g = o.word;
996
1.12k
      }
997
54.5k
      ++i;
998
54.5k
    } while (i != end);
999
59.9k
  } else if (mode == Clean) {
1000
59.8k
    do {
1001
59.8k
      (*i)->clean_lookup(word, o);
1002
73.4k
      for (;!o.at_end(); o.adv()) {
1003
13.7k
        if (TESTAFF(o.aff, achar))
1004
236
          return 1;
1005
13.5k
        else
1006
13.5k
          g = o.word;
1007
13.7k
      }
1008
59.6k
      ++i;
1009
59.6k
    } while (i != end);
1010
59.8k
  } else if (gi) {
1011
61
    g = gi->dup(word);
1012
61
  }
1013
75.0k
  if (gi && g) {
1014
809
    CheckInfo * ci = gi->add();
1015
809
    ci->word = g;
1016
809
    return -1;
1017
809
  }
1018
74.2k
  return 0;
1019
75.0k
}
1020
1021
//////////////////////////////////////////////////////////////////////
1022
//
1023
// Affix Entry
1024
//
1025
1026
bool PfxEntry::applicable(SimpleString word) const
1027
0
{
1028
0
  unsigned int cond;
1029
  /* make sure all conditions match */
1030
0
  if ((word.size > stripl) && (word.size >= conds->num)) {
1031
0
    const byte * cp = (const byte *) word.str;
1032
0
    for (cond = 0;  cond < conds->num;  cond++) {
1033
0
      if ((conds->get(*cp++) & (1 << cond)) == 0)
1034
0
        break;
1035
0
    }
1036
0
    if (cond >= conds->num) return true;
1037
0
  }
1038
0
  return false;
1039
0
}
1040
1041
// add prefix to this word assuming conditions hold
1042
SimpleString PfxEntry::add(SimpleString word, ObjStack & buf) const
1043
228k
{
1044
228k
  unsigned int cond;
1045
  /* make sure all conditions match */
1046
228k
  if ((word.size > stripl) && (word.size >= conds->num)) {
1047
228k
    const byte * cp = (const byte *) word.str;
1048
279k
    for (cond = 0;  cond < conds->num;  cond++) {
1049
180k
      if ((conds->get(*cp++) & (1 << cond)) == 0)
1050
129k
        break;
1051
180k
    }
1052
228k
    if (cond >= conds->num) {
1053
      /* */
1054
99.2k
      int alen = word.size - stripl;
1055
99.2k
      char * newword = (char *)buf.alloc(alen + appndl + 1);
1056
99.2k
      if (appndl) memcpy(newword, appnd, appndl);
1057
99.2k
      memcpy(newword + appndl, word + stripl, alen + 1);
1058
99.2k
      return SimpleString(newword, alen + appndl);
1059
99.2k
    }
1060
228k
  }
1061
129k
  return SimpleString();
1062
228k
}
1063
1064
// check if this prefix entry matches 
1065
bool PfxEntry::check(const LookupInfo & linf, const AffixMgr * pmyMgr,
1066
                     ParmString word,
1067
                     CheckInfo & ci, GuessInfo * gi, bool cross) const
1068
6.99k
{
1069
6.99k
  unsigned int    cond; // condition number being examined
1070
6.99k
  unsigned              tmpl;   // length of tmpword
1071
6.99k
  WordEntry             wordinfo;     // hash entry of root word or NULL
1072
6.99k
  byte *  cp;   
1073
6.99k
  VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1);
1074
1075
  // on entry prefix is 0 length or already matches the beginning of the word.
1076
  // So if the remaining root word has positive length
1077
  // and if there are enough chars in root word and added back strip chars
1078
  // to meet the number of characters conditions, then test it
1079
1080
6.99k
  tmpl = word.size() - appndl;
1081
1082
6.99k
  if ((tmpl > 0) &&  (tmpl + stripl >= conds->num)) {
1083
1084
    // generate new root word by removing prefix and adding
1085
    // back any characters that would have been stripped
1086
1087
4.72k
    if (stripl) strcpy (tmpword, strip);
1088
4.72k
    strcpy ((tmpword + stripl), (word + appndl));
1089
1090
    // now make sure all of the conditions on characters
1091
    // are met.  Please see the appendix at the end of
1092
    // this file for more info on exactly what is being
1093
    // tested
1094
1095
4.72k
    cp = (byte *)tmpword;
1096
5.50k
    for (cond = 0;  cond < conds->num;  cond++) {
1097
2.03k
      if ((conds->get(*cp++) & (1 << cond)) == 0) break;
1098
2.03k
    }
1099
1100
    // if all conditions are met then check if resulting
1101
    // root word in the dictionary
1102
1103
4.72k
    if (cond >= conds->num) {
1104
3.46k
      CheckInfo * lci = 0;
1105
3.46k
      CheckInfo * guess = 0;
1106
3.46k
      tmpl += stripl;
1107
1108
3.46k
      int res = linf.lookup(tmpword, &linf.sp->s_cmp_end, achar, wordinfo, gi);
1109
1110
3.46k
      if (res == 1) {
1111
1112
3
        lci = &ci;
1113
3
        lci->word = wordinfo.word;
1114
3
        goto quit;
1115
        
1116
3.46k
      } else if (res == -1) {
1117
1118
129
        guess = gi->head;
1119
1120
129
      }
1121
      
1122
      // prefix matched but no root word was found 
1123
      // if XPRODUCT is allowed, try again but now 
1124
      // cross checked combined with a suffix
1125
      
1126
3.46k
      if (gi)
1127
2.23k
        lci = gi->head;
1128
      
1129
3.46k
      if (cross && xpflg & XPRODUCT) {
1130
3.46k
        if (pmyMgr->suffix_check(linf, ParmString(tmpword, tmpl), 
1131
3.46k
                                 ci, gi,
1132
3.46k
                                 XPRODUCT, (AffEntry *)this)) {
1133
1
          lci = &ci;
1134
          
1135
3.46k
        } else if (gi) {
1136
          
1137
2.23k
          CheckInfo * stop = lci;
1138
2.23k
          for (lci = gi->head; 
1139
2.24k
               lci != stop; 
1140
2.23k
               lci = const_cast<CheckInfo *>(lci->next)) 
1141
13
          {
1142
13
            lci->pre_flag = achar;
1143
13
            lci->pre_strip_len = stripl;
1144
13
            lci->pre_add_len = appndl;
1145
13
            lci->pre_add = appnd;
1146
13
          }
1147
          
1148
2.23k
        } else {
1149
          
1150
1.23k
          lci = 0;
1151
          
1152
1.23k
        }
1153
3.46k
      }
1154
    
1155
3.46k
      if (guess)
1156
129
        lci = guess;
1157
      
1158
3.46k
    quit:
1159
3.46k
      if (lci) {
1160
133
        lci->pre_flag = achar;
1161
133
        lci->pre_strip_len = stripl;
1162
133
        lci->pre_add_len = appndl;
1163
133
        lci->pre_add = appnd;
1164
133
      }
1165
3.46k
      if (lci == &ci) return true;
1166
3.46k
    }
1167
4.72k
  }
1168
6.99k
  return false;
1169
6.99k
}
1170
1171
bool SfxEntry::applicable(SimpleString word) const
1172
0
{
1173
0
  int cond;
1174
  /* make sure all conditions match */
1175
0
  if ((word.size > stripl) && (word.size >= conds->num)) {
1176
0
    const byte * cp = (const byte *) (word + word.size);
1177
0
    for (cond = conds->num; --cond >=0; ) {
1178
0
      if ((conds->get(*--cp) & (1 << cond)) == 0)
1179
0
        break;
1180
0
    }
1181
0
    if (cond < 0) return true;
1182
0
  }
1183
0
  return false;
1184
0
}
1185
1186
// add suffix to this word assuming conditions hold
1187
SimpleString SfxEntry::add(SimpleString word, ObjStack & buf, 
1188
                           int limit, SimpleString orig_word) const
1189
11.6M
{
1190
11.6M
  int cond;
1191
  /* make sure all conditions match */
1192
11.6M
  if ((orig_word.size > stripl) && (orig_word.size >= conds->num)) {
1193
8.65M
    const byte * cp = (const byte *) (orig_word + orig_word.size);
1194
14.2M
    for (cond = conds->num; --cond >=0; ) {
1195
11.9M
      if ((conds->get(*--cp) & (1 << cond)) == 0)
1196
6.42M
        break;
1197
11.9M
    }
1198
8.65M
    if (cond < 0) {
1199
2.22M
      int alen = word.size - stripl;
1200
2.22M
      if (alen >= limit) return EMPTY;
1201
      /* we have a match so add suffix */
1202
2.22M
      char * newword = (char *)buf.alloc(alen + appndl + 1);
1203
2.22M
      memcpy(newword, word, alen);
1204
2.22M
      memcpy(newword + alen, appnd, appndl + 1);
1205
2.22M
      return SimpleString(newword, alen + appndl);
1206
2.22M
    }
1207
8.65M
  }
1208
9.40M
  return SimpleString();
1209
11.6M
}
1210
1211
// see if this suffix is present in the word 
1212
bool SfxEntry::check(const LookupInfo & linf, ParmString word,
1213
                     CheckInfo & ci, GuessInfo * gi,
1214
                     int optflags, AffEntry* ppfx)
1215
7.65M
{
1216
7.65M
  unsigned              tmpl;    // length of tmpword 
1217
7.65M
  int     cond;    // condition beng examined
1218
7.65M
  WordEntry             wordinfo;        // hash entry pointer
1219
7.65M
  byte *  cp;
1220
7.65M
  VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1);
1221
7.65M
  PfxEntry* ep = (PfxEntry *) ppfx;
1222
1223
  // if this suffix is being cross checked with a prefix
1224
  // but it does not support cross products skip it
1225
1226
7.65M
  if ((optflags & XPRODUCT) != 0 &&  (xpflg & XPRODUCT) == 0)
1227
54
    return false;
1228
1229
  // upon entry suffix is 0 length or already matches the end of the word.
1230
  // So if the remaining root word has positive length
1231
  // and if there are enough chars in root word and added back strip chars
1232
  // to meet the number of characters conditions, then test it
1233
1234
7.65M
  tmpl = word.size() - appndl;
1235
1236
7.65M
  if ((tmpl > 0)  &&  (tmpl + stripl >= conds->num)) {
1237
1238
    // generate new root word by removing suffix and adding
1239
    // back any characters that would have been stripped or
1240
    // or null terminating the shorter string
1241
1242
2.84M
    strcpy (tmpword, word);
1243
2.84M
    cp = (byte *)(tmpword + tmpl);
1244
2.84M
    if (stripl) {
1245
2.79M
      strcpy ((char *)cp, strip);
1246
2.79M
      tmpl += stripl;
1247
2.79M
      cp = (byte *)(tmpword + tmpl);
1248
2.79M
    } else *cp = '\0';
1249
1250
    // now make sure all of the conditions on characters
1251
    // are met.  Please see the appendix at the end of
1252
    // this file for more info on exactly what is being
1253
    // tested
1254
1255
6.20M
    for (cond = conds->num;  --cond >= 0; ) {
1256
6.13M
      if ((conds->get(*--cp) & (1 << cond)) == 0) break;
1257
6.13M
    }
1258
1259
    // if all conditions are met then check if resulting
1260
    // root word in the dictionary
1261
1262
2.84M
    if (cond < 0) {
1263
71.8k
      CheckInfo * lci = 0;
1264
71.8k
      tmpl += stripl;
1265
71.8k
      const SensitiveCompare * cmp = 
1266
71.8k
        optflags & XPRODUCT ? &linf.sp->s_cmp_middle : &linf.sp->s_cmp_begin;
1267
71.8k
      int res = linf.lookup(tmpword, cmp, achar, wordinfo, gi);
1268
71.8k
      if (res == 1
1269
236
          && ((optflags & XPRODUCT) == 0 || TESTAFF(wordinfo.aff, ep->achar)))
1270
224
      {
1271
224
        lci = &ci;
1272
224
        lci->word = wordinfo.word;
1273
71.5k
      } else if (res == 1 && gi) {
1274
0
        lci = gi->add();
1275
0
        lci->word = wordinfo.word;
1276
71.5k
      } else if (res == -1) { // gi must be defined
1277
680
        lci = gi->head;
1278
680
      }
1279
1280
71.8k
      if (lci) {
1281
904
        lci->suf_flag = achar;
1282
904
        lci->suf_strip_len = stripl;
1283
904
        lci->suf_add_len = appndl;
1284
904
        lci->suf_add = appnd;
1285
904
      }
1286
      
1287
71.8k
      if (lci == &ci) return true;
1288
71.8k
    }
1289
2.84M
  }
1290
7.65M
  return false;
1291
7.65M
}
1292
1293
//////////////////////////////////////////////////////////////////////
1294
//
1295
// new_affix_mgr
1296
//
1297
1298
1299
PosibErr<AffixMgr *> new_affix_mgr(ParmString name, 
1300
                                   Conv & iconv,
1301
                                   const Language * lang)
1302
975
{
1303
975
  if (name == "none")
1304
0
    return 0;
1305
  //CERR << "NEW AFFIX MGR\n";
1306
975
  String file;
1307
975
  file += lang->data_dir();
1308
975
  file += '/';
1309
975
  file += lang->name();
1310
975
  file += "_affix.dat";
1311
975
  AffixMgr * affix;
1312
975
  affix = new AffixMgr(lang);
1313
975
  PosibErrBase pe = affix->setup(file, iconv);
1314
975
  if (pe.has_err()) {
1315
0
    delete affix;
1316
0
    return pe;
1317
975
  } else {
1318
975
    return affix;
1319
975
  }
1320
975
}
1321
}
1322
1323
/**************************************************************************
1324
1325
Appendix:  Understanding Affix Code
1326
1327
1328
An affix is either a  prefix or a suffix attached to root words to make 
1329
other words.
1330
1331
Basically a Prefix or a Suffix is set of AffEntry objects
1332
which store information about the prefix or suffix along 
1333
with supporting routines to check if a word has a particular 
1334
prefix or suffix or a combination.
1335
1336
The structure affentry is defined as follows:
1337
1338
struct AffEntry
1339
{
1340
   unsigned char achar;   // char used to represent the affix
1341
   char * strip;          // string to strip before adding affix
1342
   char * appnd;          // the affix string to add
1343
   short  stripl;         // length of the strip string
1344
   short  appndl;         // length of the affix string
1345
   short  numconds;       // the number of conditions that must be met
1346
   short  xpflg;          // flag: XPRODUCT- combine both prefix and suffix 
1347
   char   conds[SETSIZE]; // array which encodes the conditions to be met
1348
};
1349
1350
1351
Here is a suffix borrowed from the en_US.aff file.  This file 
1352
is whitespace delimited.
1353
1354
SFX D Y 4 
1355
SFX D   0     e          d
1356
SFX D   y     ied        [^aeiou]y
1357
SFX D   0     ed         [^ey]
1358
SFX D   0     ed         [aeiou]y
1359
1360
This information can be interpreted as follows:
1361
1362
In the first line has 4 fields
1363
1364
Field
1365
-----
1366
1     SFX - indicates this is a suffix
1367
2     D   - is the name of the character flag which represents this suffix
1368
3     Y   - indicates it can be combined with prefixes (cross product)
1369
4     4   - indicates that sequence of 4 affentry structures are needed to
1370
               properly store the affix information
1371
1372
The remaining lines describe the unique information for the 4 SfxEntry 
1373
objects that make up this affix.  Each line can be interpreted
1374
as follows: (note fields 1 and 2 are as a check against line 1 info)
1375
1376
Field
1377
-----
1378
1     SFX         - indicates this is a suffix
1379
2     D           - is the name of the character flag for this affix
1380
3     y           - the string of chars to strip off before adding affix
1381
                         (a 0 here indicates the NULL string)
1382
4     ied         - the string of affix characters to add
1383
5     [^aeiou]y   - the conditions which must be met before the affix
1384
                    can be applied
1385
1386
Field 5 is interesting.  Since this is a suffix, field 5 tells us that
1387
there are 2 conditions that must be met.  The first condition is that 
1388
the next to the last character in the word must *NOT* be any of the 
1389
following "a", "e", "i", "o" or "u".  The second condition is that
1390
the last character of the word must end in "y".
1391
1392
So how can we encode this information concisely and be able to 
1393
test for both conditions in a fast manner?  The answer is found
1394
but studying the wonderful ispell code of Geoff Kuenning, et.al. 
1395
(now available under a normal BSD license).
1396
1397
If we set up a conds array of 256 bytes indexed (0 to 255) and access it
1398
using a character (cast to an unsigned char) of a string, we have 8 bits
1399
of information we can store about that character.  Specifically we
1400
could use each bit to say if that character is allowed in any of the 
1401
last (or first for prefixes) 8 characters of the word.
1402
1403
Basically, each character at one end of the word (up to the number 
1404
of conditions) is used to index into the conds array and the resulting 
1405
value found there says whether the that character is valid for a 
1406
specific character position in the word.  
1407
1408
For prefixes, it does this by setting bit 0 if that char is valid 
1409
in the first position, bit 1 if valid in the second position, and so on. 
1410
1411
If a bit is not set, then that char is not valid for that position in the
1412
word.
1413
1414
If working with suffixes bit 0 is used for the character closest 
1415
to the front, bit 1 for the next character towards the end, ..., 
1416
with bit numconds-1 representing the last char at the end of the string. 
1417
1418
Note: since entries in the conds[] are 8 bits, only 8 conditions 
1419
(read that only 8 character positions) can be examined at one
1420
end of a word (the beginning for prefixes and the end for suffixes.
1421
1422
So to make this clearer, lets encode the conds array values for the 
1423
first two affentries for the suffix D described earlier.
1424
1425
1426
  For the first affentry:    
1427
     numconds = 1             (only examine the last character)
1428
1429
     conds['e'] =  (1 << 0)   (the word must end in an E)
1430
     all others are all 0
1431
1432
  For the second affentry:
1433
     numconds = 2             (only examine the last two characters)     
1434
1435
     conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
1436
         where X is all characters *but* a, e, i, o, or u
1437
         
1438
1439
     conds['y'] = (1 << 1)     (the last char must be a y)
1440
     all other bits for all other entries in the conds array are zero
1441
1442
1443
**************************************************************************/