Coverage Report

Created: 2026-05-30 06:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aspell/modules/speller/default/affix.cpp
Line
Count
Source
1
// This file is part of The New Aspell
2
// Copyright (C) 2004 by Kevin Atkinson under the GNU LGPL
3
// license version 2.0 or 2.1.  You should have received a copy of the
4
// LGPL license along with this library if you did not you can find it
5
// at http://www.gnu.org/.
6
//
7
// This code is based on the the MySpell affix code:
8
//
9
/*
10
 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada And
11
 * Contributors.  All rights reserved.
12
 *
13
 * Redistribution and use in source and binary forms, with or without
14
 * modification, are permitted provided that the following conditions
15
 * are met:
16
 *
17
 * 1. Redistributions of source code must retain the above copyright
18
 *    notice, this list of conditions and the following disclaimer.
19
 *
20
 * 2. Redistributions in binary form must reproduce the above copyright
21
 *    notice, this list of conditions and the following disclaimer in the
22
 *    documentation and/or other materials provided with the distribution.
23
 *
24
 * 3. All modifications to the source code must be clearly marked as
25
 *    such.  Binary redistributions based on modified source code
26
 *    must be clearly marked as modified versions in the documentation
27
 *    and/or other materials provided with the distribution.
28
 *
29
 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
30
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
31
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
32
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
33
 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
34
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
35
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
36
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40
 * SUCH DAMAGE.
41
 *
42
 */
43
44
#include <cstdlib>
45
#include <cstring>
46
#include <cstdio>
47
48
//#include "iostream.hpp"
49
50
#include "affix.hpp"
51
#include "errors.hpp"
52
#include "getdata.hpp"
53
#include "parm_string.hpp"
54
#include "check_list.hpp"
55
#include "speller_impl.hpp"
56
#include "vararray.hpp"
57
#include "lsort.hpp"
58
#include "hash-t.hpp"
59
60
#include "gettext.h"
61
62
using namespace std;
63
64
namespace aspeller {
65
66
typedef unsigned char byte;
67
static char EMPTY[1] = {0};
68
69
//////////////////////////////////////////////////////////////////////
70
//
71
// Entry struct definations
72
//
73
74
struct Conds
75
{
76
  char * str;
77
  unsigned num;
78
  char conds[SETSIZE];
79
78.3M
  char get(byte i) const {return conds[i];}
80
};
81
82
struct AffEntry
83
{
84
  const char *   appnd;
85
  const char *   strip;
86
  byte           appndl;
87
  byte           stripl;
88
  byte           xpflg;
89
  char           achar;
90
  const Conds *  conds;
91
  //unsigned int numconds;
92
  //char         conds[SETSIZE];
93
};
94
95
// A Prefix Entry
96
  
97
struct PfxEntry : public AffEntry
98
{
99
  PfxEntry * next;
100
  PfxEntry * next_eq;
101
  PfxEntry * next_ne;
102
  PfxEntry * flag_next;
103
9.06k
  PfxEntry() {}
104
105
  bool check(const LookupInfo &, const AffixMgr * pmyMgr,
106
             ParmString, CheckInfo &, GuessInfo *, bool cross = true) const;
107
108
61.6k
  inline bool          allow_cross() const { return ((xpflg & XPRODUCT) != 0); }
109
9.06k
  inline byte flag() const { return achar;  }
110
1.08M
  inline const char *  key() const  { return appnd;  }
111
  bool applicable(SimpleString) const;
112
  SimpleString add(SimpleString, ObjStack & buf) const;
113
};
114
115
// A Suffix Entry
116
117
struct SfxEntry : public AffEntry
118
{
119
  const char * rappnd; // this is set in AffixMgr::build_sfxlist
120
  
121
  SfxEntry *   next;
122
  SfxEntry *   next_eq;
123
  SfxEntry *   next_ne;
124
  SfxEntry *   flag_next;
125
126
730k
  SfxEntry() {}
127
128
  bool check(const LookupInfo &, ParmString, CheckInfo &, GuessInfo *,
129
             int optflags, AffEntry * ppfx);
130
131
303k
  inline bool          allow_cross() const { return ((xpflg & XPRODUCT) != 0); }
132
730k
  inline byte flag() const { return achar;  }
133
520M
  inline const char *  key() const  { return rappnd; } 
134
  bool applicable(SimpleString) const;
135
  SimpleString add(SimpleString, ObjStack & buf, int limit, SimpleString) const;
136
};
137
138
//////////////////////////////////////////////////////////////////////
139
//
140
// Utility functions declarations
141
//
142
143
/* return 1 if s1 is subset of s2 */
144
static bool isSubset(const char * s1, const char * s2)
145
246M
{
146
1.03G
  while( *s1 && (*s1 == *s2) ) {
147
790M
    s1++;
148
790M
    s2++;
149
790M
  }
150
246M
  return (*s1 == '\0');
151
246M
}
152
153
// return 1 if s1 (reversed) is a leading subset of end of s2
154
static bool isRevSubset(const char * s1, const char * end_of_s2, int len)
155
11.9M
{
156
26.8M
  while( (len > 0) && *s1 && (*s1 == *end_of_s2) ) {
157
14.9M
    s1++;
158
14.9M
    end_of_s2--;
159
14.9M
    len --;
160
14.9M
  }
161
11.9M
  return (*s1 == '\0');
162
11.9M
}
163
164
template <class T>
165
struct AffixLess
166
{
167
7.80M
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
aspeller::AffixLess<aspeller::PfxEntry>::operator()(aspeller::PfxEntry*, aspeller::PfxEntry*) const
Line
Count
Source
167
7.84k
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
aspeller::AffixLess<aspeller::SfxEntry>::operator()(aspeller::SfxEntry*, aspeller::SfxEntry*) const
Line
Count
Source
167
7.79M
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
168
};
169
170
// struct StringLookup {
171
//   struct Parms {
172
//     typedef const char * Value;
173
//     typedef const char * Key;
174
//     static const bool is_multi = false;
175
//     hash<const char *> hfun;
176
//     size_t hash(const char * s) {return hfun(s);}
177
//     bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;}
178
//     const char * key(const char * c) {return c;}
179
//   };
180
//   typedef HashTable<Parms> Lookup;
181
//   Lookup lookup;
182
//   ObjStack * data_buf;
183
//   StringLookup(ObjStack * b) : data_buf(b) {}
184
//   const char * dup(const char * orig) {
185
//     pair<Lookup::iterator, bool> res = lookup.insert(orig);
186
//     if (res.second) *res.first = data_buf->dup(orig);
187
//     return *res.first;
188
//     //return data_buf->dup(orig);
189
//   }
190
// };
191
192
struct CondsLookupParms {
193
  typedef const Conds * Value;
194
  typedef const char * Key;
195
  static const bool is_multi = false;
196
  acommon::hash<const char *> hfun;
197
768k
  size_t hash(const char * s) {return hfun(s);}
198
1.13M
  bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;}
199
1.16M
  const char * key(const Conds * c) {return c->str;}
200
};
201
202
typedef HashTable<CondsLookupParms> CondsLookup;
203
204
// normalizes and checks the cond_str
205
// returns the length of the new string or -1 if invalid
206
static int normalize_cond_str(char * str)
207
739k
{
208
739k
  char * s = str;
209
739k
  char * d = str;
210
3.77M
  while (*s) {
211
3.03M
    if (*s != '[') {
212
2.93M
      *d++ = *s++;
213
2.93M
    } else if (s[1] == '\0' || s[1] == ']') {
214
0
      return -1;
215
95.7k
    } else if (s[2] == ']') {
216
0
      *d++ = s[1];
217
0
      s += 3;
218
95.7k
    } else {
219
95.7k
      *d++ = *s++;
220
95.7k
      if (*s == '^') *d++ = *s++;
221
365k
      while (*s != ']') {
222
270k
        if (*s == '\0' || *s == '[') return -1;
223
270k
        char * min = s;
224
590k
        for (char * i = s + 1; *i != ']'; ++i) {
225
320k
          if ((byte)*i < (byte)*min) min = i;}
226
270k
        char c = *s;
227
270k
        *d++ = *min;
228
270k
        *min = c;
229
270k
        ++s;
230
270k
      }
231
95.7k
      *d++ = *s++;
232
95.7k
    }
233
3.03M
  }
234
739k
  *d = '\0';
235
739k
  return d - str;
236
739k
}
237
238
static void encodeit(CondsLookup &, ObjStack &, 
239
                     AffEntry * ptr, char * cs);
240
241
//////////////////////////////////////////////////////////////////////
242
//
243
// Affix Manager
244
//
245
246
PosibErr<void> AffixMgr::setup(ParmString affpath, Conv & iconv)
247
875
{
248
  // register hash manager and load affix data from aff file
249
  //cpdmin = 3;  // default value
250
875
  max_strip_ = 0;
251
224k
  for (int i=0; i < SETSIZE; i++) {
252
224k
    pStart[i] = NULL;
253
224k
    sStart[i] = NULL;
254
224k
    pFlag[i] = NULL;
255
224k
    sFlag[i] = NULL;
256
224k
    max_strip_f[i] = 0;
257
224k
  }
258
875
  return parse_file(affpath, iconv);
259
875
}
260
261
AffixMgr::AffixMgr(const Language * l) 
262
875
  : lang(l), data_buf(1024*16) {}
263
264
875
AffixMgr::~AffixMgr() {}
265
266
static inline void max_(int & lhs, int rhs) 
267
1.00M
{
268
1.00M
  if (lhs < rhs) lhs = rhs;
269
1.00M
}
270
271
// read in aff file and build up prefix and suffix entry objects 
272
PosibErr<void> AffixMgr::parse_file(const char * affpath, Conv & iconv)
273
875
{
274
  // io buffers
275
875
  String buf; DataPair dp;
276
277
875
  CondsLookup conds_lookup;
278
 
279
  // open the affix file
280
875
  affix_file = data_buf.dup(affpath);
281
875
  FStream afflst;
282
875
  RET_ON_ERR(afflst.open(affpath,"r"));
283
284
  // step one is to parse the affix file building up the internal
285
  // affix data structures
286
287
  // read in each line ignoring any that do not
288
  // start with a known line type indicator
289
290
875
  char prev_aff = '\0';
291
292
131k
  while (getdata_pair(afflst,dp,buf)) {
293
130k
    char affix_type = ' ';
294
295
    /* parse in the name of the character set used by the .dict and .aff */
296
297
130k
    if (dp.key == "SET") {
298
875
      String buf;
299
875
      encoding = data_buf.dup(fix_encoding_str(dp.value, buf));
300
875
      if (strcmp(encoding, lang->data_encoding()) != 0)
301
0
        return make_err(incorrect_encoding, affix_file, lang->data_encoding(), encoding);
302
875
    }
303
304
    /* parse in the flag used by the controlled compound words */
305
    //else if (d.key == "COMPOUNDFLAG")
306
    //  compound = data_buf.dup(d.value);
307
308
    /* parse in the flag used by the controlled compound words */
309
    //else if (d.key == "COMPOUNDMIN")
310
    //  cpdmin = atoi(d.value); // FiXME
311
312
    //else if (dp.key == "TRY" || dp.key == "REP");
313
314
129k
    else if (dp.key == "PFX" || dp.key == "SFX")
315
22.2k
      affix_type = dp.key[0];
316
317
130k
    if (affix_type == ' ') continue;
318
319
    //
320
    // parse this affix: P - prefix, S - suffix
321
    //
322
323
22.2k
    int numents = 0;      // number of affentry structures to parse
324
22.2k
    char achar='\0';      // affix char identifier
325
22.2k
    short xpflg=0;
326
22.2k
    AffEntry * nptr;
327
22.2k
    {
328
      // split affix header line into pieces
329
22.2k
      split(dp);
330
22.2k
      if (dp.key.empty()) goto error;
331
      // key is affix char
332
22.2k
      const char * astr = iconv(dp.key);
333
22.2k
      if (astr[0] == '\0' || astr[1] != '\0') goto error;
334
22.2k
      achar = astr[0];
335
22.2k
      if (achar == prev_aff) goto error_count;
336
22.2k
      prev_aff = achar;
337
338
22.2k
      split(dp);
339
22.2k
      if (dp.key.size != 1 || 
340
22.2k
          !(dp.key[0] == 'Y' || dp.key[0] == 'N')) goto error;
341
      // key is cross product indicator 
342
22.2k
      if (dp.key[0] == 'Y') xpflg = XPRODUCT;
343
    
344
22.2k
      split(dp);
345
22.2k
      if (dp.key.empty()) goto error;
346
      // key is number of affentries
347
      
348
22.2k
      numents = atoi(dp.key); 
349
  
350
762k
      for (int j = 0; j < numents; j++) {
351
739k
        getdata_pair(afflst, dp, buf);
352
353
739k
        if (affix_type == 'P') {
354
9.06k
          nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(PfxEntry));
355
9.06k
          new (nptr) PfxEntry;
356
730k
        } else {
357
730k
          nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(SfxEntry));
358
730k
          new (nptr) SfxEntry;
359
730k
        }
360
361
739k
        nptr->xpflg = xpflg;
362
363
739k
        split(dp);
364
739k
        if (dp.key.empty()) goto error;
365
        // key is affix charter
366
739k
        if (iconv(dp.key)[0] != achar) goto error_count;
367
739k
        nptr->achar = achar;
368
 
369
739k
        split(dp);
370
739k
        if (dp.key.empty()) goto error;
371
        // key is strip 
372
739k
        if (dp.key != "0") {
373
500k
          ParmString s0(iconv(dp.key));
374
500k
          max_(max_strip_, s0.size());
375
500k
          max_(max_strip_f[(byte)achar], s0.size());
376
500k
          nptr->strip = data_buf.dup(s0);
377
500k
          nptr->stripl = s0.size();
378
500k
        } else {
379
239k
          nptr->strip  = "";
380
239k
          nptr->stripl = 0;
381
239k
        }
382
    
383
739k
        split(dp);
384
739k
        if (dp.key.empty()) goto error;
385
        // key is affix string or 0 for null
386
739k
        if (dp.key != "0") {
387
738k
          nptr->appnd  = data_buf.dup(iconv(dp.key));
388
738k
          nptr->appndl = strlen(nptr->appnd);
389
738k
        } else {
390
1.56k
          nptr->appnd  = "";
391
1.56k
          nptr->appndl = 0;
392
1.56k
        }
393
    
394
739k
        split(dp);
395
739k
        if (dp.key.empty()) goto error;
396
        // key is the conditions descriptions
397
739k
        char * cond = iconv(dp.key);
398
739k
        int cond_len = normalize_cond_str(cond);
399
739k
        if (cond_len < 0)
400
0
          return (make_err(invalid_cond, MsgConv(lang)(cond))
401
0
                  .with_file(affix_file, dp.line_num));
402
739k
        if (nptr->stripl != 0) {
403
500k
          char * cc = cond;
404
500k
          if (affix_type == 'S') cc += cond_len - nptr->stripl;
405
500k
          if (cond_len < nptr->stripl || 
406
500k
              memcmp(cc, nptr->strip, nptr->stripl) != 0)
407
0
            return (make_err(invalid_cond_strip, 
408
0
                             MsgConv(lang)(cond), MsgConv(lang)(nptr->strip))
409
0
                    .with_file(affix_file, dp.line_num));
410
500k
        }
411
739k
        encodeit(conds_lookup, data_buf, nptr, cond);
412
    
413
        // now create SfxEntry or PfxEntry objects and use links to
414
        // build an ordered (sorted by affix string) list
415
739k
        if (affix_type == 'P')
416
9.06k
          build_pfxlist(static_cast<PfxEntry *>(nptr));
417
730k
        else
418
730k
          build_sfxlist(static_cast<SfxEntry *>(nptr)); 
419
739k
      }
420
22.2k
    }
421
22.2k
    continue;
422
22.2k
  error:
423
0
    return make_err(corrupt_affix, MsgConv(lang)(achar)).with_file(affix_file, dp.line_num);
424
0
  error_count:
425
0
    return make_err(corrupt_affix, MsgConv(lang)(achar), 
426
0
                    _("Possibly incorrect count.")).with_file(affix_file, dp.line_num);
427
22.2k
  }
428
875
  afflst.close();
429
430
  // now we can speed up performance greatly taking advantage of the 
431
  // relationship between the affixes and the idea of "subsets".
432
433
  // View each prefix as a potential leading subset of another and view
434
  // each suffix (reversed) as a potential trailing subset of another.
435
436
  // To illustrate this relationship if we know the prefix "ab" is
437
  // found in the word to examine, only prefixes that "ab" is a
438
  // leading subset of need be examined.  Furthermore is "ab" is not
439
  // present then none of the prefixes that "ab" is is a subset need
440
  // be examined.
441
442
  // The same argument goes for suffix string that are reversed.
443
444
  // Then to top this off why not examine the first char of the word
445
  // to quickly limit the set of prefixes to examine (i.e. the
446
  // prefixes to examine must be leading supersets of the first
447
  // character of the word (if they exist)
448
 
449
  // To take advantage of this "subset" relationship, we need to add
450
  // two links from entry.  One to take next if the current prefix
451
  // is found (call it nexteq) and one to take next if the current
452
  // prefix is not found (call it nextne).
453
454
  // Since we have built ordered lists, all that remains is to
455
  // properly initialize the nextne and nexteq pointers that relate
456
  // them
457
458
875
  process_pfx_order();
459
875
  process_sfx_order();
460
461
  //CERR.printf("%u\n", data_buf.calc_size()/1024);
462
463
875
  return no_err;
464
465
875
}
466
467
468
// we want to be able to quickly access prefix information
469
// both by prefix flag, and sorted by prefix string itself
470
// so we need to set up two indexes
471
472
PosibErr<void> AffixMgr::build_pfxlist(PfxEntry* pfxptr)
473
9.06k
{
474
9.06k
  PfxEntry * ptr;
475
9.06k
  PfxEntry * ep = pfxptr;
476
477
  // get the right starting point 
478
9.06k
  const char * key = ep->key();
479
9.06k
  const byte flg = ep->flag();
480
481
  // first index by flag which must exist
482
9.06k
  ptr = pFlag[flg];
483
9.06k
  ep->flag_next = ptr;
484
9.06k
  pFlag[flg] = ep;
485
486
  // next insert the affix string, it will be sorted latter
487
488
9.06k
  byte sp = *((const byte *)key);
489
9.06k
  ptr = pStart[sp];
490
9.06k
  ep->next = ptr;
491
9.06k
  pStart[sp] = ep;
492
9.06k
  return no_err;
493
9.06k
}
494
495
// we want to be able to quickly access suffix information
496
// both by suffix flag, and sorted by the reverse of the
497
// suffix string itself; so we need to set up two indexes
498
499
PosibErr<void> AffixMgr::build_sfxlist(SfxEntry* sfxptr)
500
730k
{
501
730k
  SfxEntry * ptr;
502
730k
  SfxEntry * ep = sfxptr;
503
730k
  char * tmp = (char *)data_buf.alloc(sfxptr->appndl + 1);
504
730k
  sfxptr->rappnd = tmp;
505
506
  // reverse the string
507
730k
  char * dest = tmp + sfxptr->appndl;
508
730k
  *dest-- = 0;
509
730k
  const char * src = sfxptr->appnd;
510
4.97M
  for (; dest >= tmp; --dest, ++src)
511
4.24M
    *dest = *src;
512
513
  /* get the right starting point */
514
730k
  const char * key = ep->key();
515
730k
  const byte flg = ep->flag();
516
517
  // first index by flag which must exist
518
730k
  ptr = sFlag[flg];
519
730k
  ep->flag_next = ptr;
520
730k
  sFlag[flg] = ep;
521
522
  // next insert the affix string, it will be sorted latter
523
    
524
730k
  byte sp = *((const byte *)key);
525
730k
  ptr = sStart[sp];
526
730k
  ep->next = ptr;
527
730k
  sStart[sp] = ep;
528
730k
  return no_err;
529
730k
}
530
531
532
533
// initialize the PfxEntry links NextEQ and NextNE to speed searching
534
PosibErr<void> AffixMgr::process_pfx_order()
535
875
{
536
875
  PfxEntry* ptr;
537
538
  // loop through each prefix list starting point
539
224k
  for (int i=1; i < SETSIZE; i++) {
540
541
223k
    ptr = pStart[i];
542
543
223k
    if (ptr && ptr->next)
544
1.17k
      ptr = pStart[i] = sort(ptr, AffixLess<PfxEntry>());
545
546
    // look through the remainder of the list
547
    //  and find next entry with affix that 
548
    // the current one is not a subset of
549
    // mark that as destination for NextNE
550
    // use next in list that you are a subset
551
    // of as NextEQ
552
553
232k
    for (; ptr != NULL; ptr = ptr->next) {
554
555
9.06k
      PfxEntry * nptr = ptr->next;
556
12.4k
      for (; nptr != NULL; nptr = nptr->next) {
557
6.65k
        if (! isSubset( ptr->key() , nptr->key() )) break;
558
6.65k
      }
559
9.06k
      ptr->next_ne = nptr;
560
9.06k
      ptr->next_eq = NULL;
561
9.06k
      if ((ptr->next) && isSubset(ptr->key() , 
562
3.57k
                                  (ptr->next)->key())) 
563
837
        ptr->next_eq = ptr->next;
564
9.06k
    }
565
566
    // now clean up by adding smart search termination strings
567
    // if you are already a superset of the previous prefix
568
    // but not a subset of the next, search can end here
569
    // so set NextNE properly
570
571
223k
    ptr = pStart[i];
572
232k
    for (; ptr != NULL; ptr = ptr->next) {
573
9.06k
      PfxEntry * nptr = ptr->next;
574
9.06k
      PfxEntry * mptr = NULL;
575
12.4k
      for (; nptr != NULL; nptr = nptr->next) {
576
6.65k
        if (! isSubset(ptr->key(),nptr->key())) break;
577
3.42k
        mptr = nptr;
578
3.42k
      }
579
9.06k
      if (mptr) mptr->next_ne = NULL;
580
9.06k
    }
581
223k
  }
582
875
  return no_err;
583
875
}
584
585
586
587
// initialize the SfxEntry links NextEQ and NextNE to speed searching
588
PosibErr<void> AffixMgr::process_sfx_order()
589
875
{
590
875
  SfxEntry* ptr;
591
592
  // loop through each prefix list starting point
593
224k
  for (int i=1; i < SETSIZE; i++) {
594
595
223k
    ptr = sStart[i];
596
597
223k
    if (ptr && ptr->next)
598
7.51k
      ptr = sStart[i] = sort(ptr, AffixLess<SfxEntry>());
599
600
    // look through the remainder of the list
601
    //  and find next entry with affix that 
602
    // the current one is not a subset of
603
    // mark that as destination for NextNE
604
    // use next in list that you are a subset
605
    // of as NextEQ
606
607
952k
    for (; ptr != NULL; ptr = ptr->next) {
608
729k
      SfxEntry * nptr = ptr->next;
609
122M
      for (; nptr != NULL; nptr = nptr->next) {
610
122M
        if (! isSubset(ptr->key(),nptr->key())) break;
611
122M
      }
612
729k
      ptr->next_ne = nptr;
613
729k
      ptr->next_eq = NULL;
614
729k
      if ((ptr->next) && isSubset(ptr->key(),(ptr->next)->key())) 
615
593k
        ptr->next_eq = ptr->next;
616
729k
    }
617
618
619
    // now clean up by adding smart search termination strings:
620
    // if you are already a superset of the previous suffix
621
    // but not a subset of the next, search can end here
622
    // so set NextNE properly
623
624
223k
    ptr = sStart[i];
625
952k
    for (; ptr != NULL; ptr = ptr->next) {
626
729k
      SfxEntry * nptr = ptr->next;
627
729k
      SfxEntry * mptr = NULL;
628
122M
      for (; nptr != NULL; nptr = nptr->next) {
629
122M
        if (! isSubset(ptr->key(),nptr->key())) break;
630
121M
        mptr = nptr;
631
121M
      }
632
729k
      if (mptr) mptr->next_ne = NULL;
633
729k
    }
634
223k
  }
635
875
  return no_err;
636
875
}
637
638
// takes aff file condition string and creates the
639
// conds array - please see the appendix at the end of the
640
// file affentry.cxx which describes what is going on here
641
// in much more detail
642
643
static void encodeit(CondsLookup & l, ObjStack & buf, 
644
                     AffEntry * ptr, char * cs)
645
739k
{
646
739k
  byte c;
647
739k
  int i, j, k;
648
649
  // see if we already have this conds matrix
650
651
739k
  CondsLookup::iterator itr = l.find(cs);
652
739k
  if (!(itr == l.end())) {
653
720k
    ptr->conds = *itr;
654
720k
    return;
655
720k
  }
656
657
19.3k
  Conds * cds = (Conds *)buf.alloc_bottom(sizeof(Conds));
658
19.3k
  cds->str = buf.dup(cs);
659
19.3k
  l.insert(cds);
660
19.3k
  ptr->conds = cds;
661
662
19.3k
  int nc = strlen(cs);
663
19.3k
  VARARRAYM(byte, mbr, nc + 1, MAXLNLEN);
664
665
  // now clear the conditions array
666
19.3k
  memset(cds->conds, 0, sizeof(cds->conds));
667
668
  // now parse the string to create the conds array
669
  
670
19.3k
  int neg = 0;   // complement indicator
671
19.3k
  int grp = 0;   // group indicator
672
19.3k
  int n = 0;     // number of conditions
673
19.3k
  int ec = 0;    // end condition indicator
674
19.3k
  int nm = 0;    // number of member in group
675
676
  // if no condition just return
677
19.3k
  if (strcmp(cs,".")==0) {
678
875
    cds->num = 0;
679
875
    return;
680
875
  }
681
682
18.5k
  i = 0;
683
111k
  while (i < nc) {
684
93.4k
    c = *((byte *)(cs + i));
685
686
    // start group indicator
687
93.4k
    if (c == '[') {
688
9.65k
      grp = 1;
689
9.65k
      c = 0;
690
9.65k
    }
691
692
    // complement flag
693
93.4k
    if ((grp == 1) && (c == '^')) {
694
6.96k
      neg = 1;
695
6.96k
      c = 0;
696
6.96k
    }
697
698
    // end goup indicator
699
93.4k
    if (c == ']') {
700
9.65k
      ec = 1;
701
9.65k
      c = 0;
702
9.65k
    }
703
704
    // add character of group to list
705
93.4k
    if ((grp == 1) && (c != 0)) {
706
33.8k
      *(mbr + nm) = c;
707
33.8k
      nm++;
708
33.8k
      c = 0;
709
33.8k
    }
710
711
    // end of condition 
712
93.4k
    if (c != 0) {
713
33.3k
      ec = 1;
714
33.3k
    }
715
716
    
717
93.4k
    if (ec) {
718
42.9k
      if (grp == 1) {
719
9.65k
        if (neg == 0) {
720
          // set the proper bits in the condition array vals for those chars
721
13.9k
          for (j=0;j<nm;j++) {
722
11.2k
            k = (unsigned int) mbr[j];
723
11.2k
            cds->conds[k] = cds->conds[k] | (1 << n);
724
11.2k
          }
725
6.96k
        } else {
726
          // complement so set all of them and then unset indicated ones
727
1.78M
          for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n);
728
29.6k
          for (j=0;j<nm;j++) {
729
22.6k
            k = (unsigned int) mbr[j];
730
22.6k
            cds->conds[k] = cds->conds[k] & ~(1 << n);
731
22.6k
          }
732
6.96k
        }
733
9.65k
        neg = 0;
734
9.65k
        grp = 0;   
735
9.65k
        nm = 0;
736
33.3k
      } else {
737
        // not a group so just set the proper bit for this char
738
        // but first handle special case of . inside condition
739
33.3k
        if (c == '.') {
740
          // wild card character so set them all
741
0
          for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n);
742
33.3k
        } else {  
743
33.3k
          cds->conds[(unsigned int)c] = cds->conds[(unsigned int)c] | (1 << n);
744
33.3k
        }
745
33.3k
      }
746
42.9k
      n++;
747
42.9k
      ec = 0;
748
42.9k
    }
749
750
751
93.4k
    i++;
752
93.4k
  }
753
18.5k
  cds->num = n;
754
18.5k
  return;
755
19.3k
}
756
757
758
// check word for prefixes
759
bool AffixMgr::prefix_check (const LookupInfo & linf, ParmString word, 
760
                             CheckInfo & ci, GuessInfo * gi, bool cross) const
761
729k
{
762
729k
  if (word.empty()) return false;
763
 
764
  // first handle the special case of 0 length prefixes
765
729k
  PfxEntry * pe = pStart[0];
766
729k
  while (pe) {
767
0
    if (pe->check(linf,this,word,ci,gi)) return true;
768
0
    pe = pe->next;
769
0
  }
770
  
771
  // now handle the general case
772
729k
  byte sp = *reinterpret_cast<const byte *>(word.str());
773
729k
  PfxEntry * pptr = pStart[sp];
774
775
1.75M
  while (pptr) {
776
1.03M
    if (isSubset(pptr->key(),word)) {
777
82.0k
      if (pptr->check(linf,this,word,ci,gi,cross)) return true;
778
82.0k
      pptr = pptr->next_eq;
779
948k
    } else {
780
948k
      pptr = pptr->next_ne;
781
948k
    }
782
1.03M
  }
783
    
784
729k
  return false;
785
729k
}
786
787
788
// check word for suffixes
789
bool AffixMgr::suffix_check (const LookupInfo & linf, ParmString word, 
790
                             CheckInfo & ci, GuessInfo * gi,
791
                             int sfxopts, AffEntry * ppfx) const
792
829k
{
793
829k
  if (word.empty()) return false;
794
795
  // first handle the special case of 0 length suffixes
796
829k
  SfxEntry * se = sStart[0];
797
34.7M
  while (se) {
798
33.9M
    if (se->check(linf, word, ci, gi, sfxopts, ppfx)) return true;
799
33.9M
    se = se->next;
800
33.9M
  }
801
  
802
  // now handle the general case
803
828k
  byte sp = *((const byte *)(word + word.size() - 1));
804
828k
  SfxEntry * sptr = sStart[sp];
805
806
12.7M
  while (sptr) {
807
11.9M
    if (isRevSubset(sptr->key(), word + word.size() - 1, word.size())) {
808
7.72M
      if (sptr->check(linf, word, ci, gi, sfxopts, ppfx)) return true;
809
7.72M
      sptr = sptr->next_eq;
810
7.72M
    } else {
811
4.22M
      sptr = sptr->next_ne;
812
4.22M
    }
813
11.9M
  }
814
    
815
827k
  return false;
816
828k
}
817
818
// check if word with affixes is correctly spelled
819
bool AffixMgr::affix_check(const LookupInfo & linf, ParmString word, 
820
                           CheckInfo & ci, GuessInfo * gi) const
821
728k
{
822
728k
  if (word.empty()) return false;
823
824
  // Deal With Case in a semi-intelligent manner
825
728k
  CasePattern cp = lang->LangImpl::case_pattern(word);
826
728k
  ParmString pword = word;
827
728k
  ParmString sword = word;
828
728k
  CharVector lower;
829
728k
  if (cp == FirstUpper) {
830
36.4k
    lower.append(word, word.size() + 1);
831
36.4k
    lower[0] = lang->to_lower(word[0]);
832
36.4k
    pword = ParmString(lower.data(), lower.size() - 1);
833
691k
  } else if (cp == AllUpper) {
834
103k
    lower.resize(word.size() + 1);
835
103k
    unsigned int i = 0;
836
1.77M
    for (; i != word.size(); ++i)
837
1.66M
      lower[i] = lang->to_lower(word[i]);
838
103k
    lower[i] = '\0';
839
103k
    pword = ParmString(lower.data(), lower.size() - 1);
840
103k
    sword = pword;
841
103k
  }
842
843
  // check all prefixes (also crossed with suffixes if allowed) 
844
728k
  if (prefix_check(linf, pword, ci, gi)) return true;
845
846
  // if still not found check all suffixes
847
728k
  if (suffix_check(linf, sword, ci, gi, 0, NULL)) return true;
848
849
  // if still not found check again but with the lower case version
850
  // which can make a difference if the entire word matches the cond
851
  // string
852
726k
  if (cp == FirstUpper) {
853
36.4k
    return suffix_check(linf, pword, ci, gi, 0, NULL);
854
689k
  } else {
855
689k
    return false;
856
689k
  }
857
726k
}
858
859
void AffixMgr::munch(ParmString word, GuessInfo * gi, bool cross) const
860
1.36k
{
861
1.36k
  LookupInfo li(0, LookupInfo::AlwaysTrue);
862
1.36k
  CheckInfo ci;
863
1.36k
  gi->reset();
864
1.36k
  CasePattern cp = lang->LangImpl::case_pattern(word);
865
1.36k
  if (cp == AllUpper) return;
866
1.34k
  if (cp != FirstUpper)
867
1.13k
    prefix_check(li, word, ci, gi, cross);
868
1.34k
  suffix_check(li, word, ci, gi, 0, NULL);
869
1.34k
}
870
871
WordAff * AffixMgr::expand(ParmString word, ParmString aff, 
872
                           ObjStack & buf, int limit) const
873
254k
{
874
254k
  byte * empty = (byte *)buf.alloc(1);
875
254k
  *empty = 0;
876
877
254k
  byte * suf  = (byte *)buf.alloc(aff.size() + 1); 
878
254k
  byte * suf_e = suf;
879
254k
  byte * csuf = (byte *)buf.alloc(aff.size() + 1); 
880
254k
  byte * csuf_e = csuf;
881
882
254k
  WordAff * head = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
883
254k
  WordAff * cur = head;
884
254k
  cur->word = buf.dup(word);
885
254k
  cur->aff  = suf;
886
887
254k
  for (const byte * c = (const byte *)aff.str(), * end = c + aff.size();
888
619k
       c != end; 
889
364k
       ++c) 
890
364k
  {
891
364k
    if (sFlag[*c]) *suf_e++ = *c; 
892
364k
    if (sFlag[*c] && sFlag[*c]->allow_cross()) *csuf_e++ = *c;
893
    
894
507k
    for (PfxEntry * p = pFlag[*c]; p; p = p->flag_next) {
895
142k
      SimpleString newword = p->add(word, buf);
896
142k
      if (!newword) continue;
897
61.6k
      cur->next = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
898
61.6k
      cur = cur->next;
899
61.6k
      cur->word = newword;
900
61.6k
      cur->aff = p->allow_cross() ? csuf : empty;
901
61.6k
    }
902
364k
  }
903
904
254k
  *suf_e = 0;
905
254k
  *csuf_e = 0;
906
254k
  cur->next = 0;
907
908
254k
  if (limit == 0) return head;
909
910
254k
  WordAff * * end = &cur->next;
911
254k
  WordAff * * very_end = end;
912
254k
  size_t nsuf_s = suf_e - suf + 1;
913
914
571k
  for (WordAff * * cur = &head; cur != end; cur = &(*cur)->next) {
915
316k
    if ((int)(*cur)->word.size - max_strip_ >= limit) continue;
916
316k
    byte * nsuf = (byte *)buf.alloc(nsuf_s);
917
316k
    expand_suffix((*cur)->word, (*cur)->aff, buf, limit, nsuf, &very_end, word);
918
316k
    (*cur)->aff = nsuf;
919
316k
  }
920
921
254k
  return head;
922
254k
}
923
924
WordAff * AffixMgr::expand_suffix(ParmString word, const byte * aff, 
925
                                  ObjStack & buf, int limit,
926
                                  byte * new_aff, WordAff * * * l,
927
                                  ParmString orig_word) const
928
316k
{
929
316k
  WordAff * head = 0;
930
316k
  if (l) head = **l;
931
316k
  WordAff * * cur = l ? *l : &head;
932
316k
  bool expanded     = false;
933
316k
  bool not_expanded = false;
934
316k
  if (!orig_word) orig_word = word;
935
936
694k
  while (*aff) {
937
378k
    if ((int)word.size() - max_strip_f[*aff] < limit) {
938
10.2M
      for (SfxEntry * p = sFlag[*aff]; p; p = p->flag_next) {
939
9.85M
        SimpleString newword = p->add(word, buf, limit, orig_word);
940
9.85M
        if (!newword) continue;
941
2.00M
        if (newword == EMPTY) {not_expanded = true; continue;}
942
2.00M
        *cur = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
943
2.00M
        (*cur)->word = newword;
944
2.00M
        (*cur)->aff  = (const byte *)EMPTY;
945
2.00M
        cur = &(*cur)->next;
946
2.00M
        expanded = true;
947
2.00M
      }
948
378k
    }
949
378k
    if (new_aff && (!expanded || not_expanded)) *new_aff++ = *aff;
950
378k
    ++aff;
951
378k
  }
952
316k
  *cur = 0;
953
316k
  if (new_aff) *new_aff = 0;
954
316k
  if (l) *l = cur;
955
316k
  return head;
956
316k
}
957
958
CheckAffixRes AffixMgr::check_affix(ParmString word, char aff) const
959
0
{
960
0
  CheckAffixRes res = InvalidAffix;
961
  
962
0
  for (PfxEntry * p = pFlag[(unsigned char)aff]; p; p = p->flag_next) {
963
0
    res = InapplicableAffix;
964
0
    if (p->applicable(word)) return ValidAffix;
965
0
  }
966
967
0
  for (SfxEntry * p = sFlag[(unsigned char)aff]; p; p = p->flag_next) {
968
0
    if (res == InvalidAffix) res = InapplicableAffix;
969
0
    if (p->applicable(word)) return ValidAffix;
970
0
  }
971
972
0
  return res;
973
0
}
974
975
976
977
//////////////////////////////////////////////////////////////////////
978
//
979
// LookupInfo
980
//
981
982
int LookupInfo::lookup (ParmString word, const SensitiveCompare * c, 
983
                        char achar, 
984
                        WordEntry & o, GuessInfo * gi) const
985
785k
{
986
785k
  SpellerImpl::WS::const_iterator i = begin;
987
785k
  const char * g = 0;
988
785k
  if (mode == Word) {
989
308k
    do {
990
308k
      (*i)->lookup(word, c, o);
991
312k
      for (;!o.at_end(); o.adv()) {
992
4.36k
        if (TESTAFF(o.aff, achar))
993
79
          return 1;
994
4.28k
        else
995
4.28k
          g = o.word;
996
4.36k
      }
997
308k
      ++i;
998
308k
    } while (i != end);
999
626k
  } else if (mode == Clean) {
1000
625k
    do {
1001
625k
      (*i)->clean_lookup(word, o);
1002
653k
      for (;!o.at_end(); o.adv()) {
1003
30.8k
        if (TESTAFF(o.aff, achar))
1004
2.17k
          return 1;
1005
28.7k
        else
1006
28.7k
          g = o.word;
1007
30.8k
      }
1008
623k
      ++i;
1009
623k
    } while (i != end);
1010
625k
  } else if (gi) {
1011
1.21k
    g = gi->dup(word);
1012
1.21k
  }
1013
783k
  if (gi && g) {
1014
3.16k
    CheckInfo * ci = gi->add();
1015
3.16k
    ci->word = g;
1016
3.16k
    return -1;
1017
3.16k
  }
1018
780k
  return 0;
1019
783k
}
1020
1021
//////////////////////////////////////////////////////////////////////
1022
//
1023
// Affix Entry
1024
//
1025
1026
bool PfxEntry::applicable(SimpleString word) const
1027
0
{
1028
0
  unsigned int cond;
1029
  /* make sure all conditions match */
1030
0
  if ((word.size > stripl) && (word.size >= conds->num)) {
1031
0
    const byte * cp = (const byte *) word.str;
1032
0
    for (cond = 0;  cond < conds->num;  cond++) {
1033
0
      if ((conds->get(*cp++) & (1 << cond)) == 0)
1034
0
        break;
1035
0
    }
1036
0
    if (cond >= conds->num) return true;
1037
0
  }
1038
0
  return false;
1039
0
}
1040
1041
// add prefix to this word assuming conditions hold
1042
SimpleString PfxEntry::add(SimpleString word, ObjStack & buf) const
1043
142k
{
1044
142k
  unsigned int cond;
1045
  /* make sure all conditions match */
1046
142k
  if ((word.size > stripl) && (word.size >= conds->num)) {
1047
142k
    const byte * cp = (const byte *) word.str;
1048
174k
    for (cond = 0;  cond < conds->num;  cond++) {
1049
113k
      if ((conds->get(*cp++) & (1 << cond)) == 0)
1050
81.0k
        break;
1051
113k
    }
1052
142k
    if (cond >= conds->num) {
1053
      /* */
1054
61.6k
      int alen = word.size - stripl;
1055
61.6k
      char * newword = (char *)buf.alloc(alen + appndl + 1);
1056
61.6k
      if (appndl) memcpy(newword, appnd, appndl);
1057
61.6k
      memcpy(newword + appndl, word + stripl, alen + 1);
1058
61.6k
      return SimpleString(newword, alen + appndl);
1059
61.6k
    }
1060
142k
  }
1061
81.0k
  return SimpleString();
1062
142k
}
1063
1064
// check if this prefix entry matches 
1065
bool PfxEntry::check(const LookupInfo & linf, const AffixMgr * pmyMgr,
1066
                     ParmString word,
1067
                     CheckInfo & ci, GuessInfo * gi, bool cross) const
1068
82.0k
{
1069
82.0k
  unsigned int    cond; // condition number being examined
1070
82.0k
  unsigned              tmpl;   // length of tmpword
1071
82.0k
  WordEntry             wordinfo;     // hash entry of root word or NULL
1072
82.0k
  byte *  cp;   
1073
82.0k
  VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1);
1074
1075
  // on entry prefix is 0 length or already matches the beginning of the word.
1076
  // So if the remaining root word has positive length
1077
  // and if there are enough chars in root word and added back strip chars
1078
  // to meet the number of characters conditions, then test it
1079
1080
82.0k
  tmpl = word.size() - appndl;
1081
1082
82.0k
  if ((tmpl > 0) &&  (tmpl + stripl >= conds->num)) {
1083
1084
    // generate new root word by removing prefix and adding
1085
    // back any characters that would have been stripped
1086
1087
80.3k
    if (stripl) strcpy (tmpword, strip);
1088
80.3k
    strcpy ((tmpword + stripl), (word + appndl));
1089
1090
    // now make sure all of the conditions on characters
1091
    // are met.  Please see the appendix at the end of
1092
    // this file for more info on exactly what is being
1093
    // tested
1094
1095
80.3k
    cp = (byte *)tmpword;
1096
82.7k
    for (cond = 0;  cond < conds->num;  cond++) {
1097
19.2k
      if ((conds->get(*cp++) & (1 << cond)) == 0) break;
1098
19.2k
    }
1099
1100
    // if all conditions are met then check if resulting
1101
    // root word in the dictionary
1102
1103
80.3k
    if (cond >= conds->num) {
1104
63.4k
      CheckInfo * lci = 0;
1105
63.4k
      CheckInfo * guess = 0;
1106
63.4k
      tmpl += stripl;
1107
1108
63.4k
      int res = linf.lookup(tmpword, &linf.sp->s_cmp_end, achar, wordinfo, gi);
1109
1110
63.4k
      if (res == 1) {
1111
1112
43
        lci = &ci;
1113
43
        lci->word = wordinfo.word;
1114
43
        goto quit;
1115
        
1116
63.4k
      } else if (res == -1) {
1117
1118
522
        guess = gi->head;
1119
1120
522
      }
1121
      
1122
      // prefix matched but no root word was found 
1123
      // if XPRODUCT is allowed, try again but now 
1124
      // cross checked combined with a suffix
1125
      
1126
63.4k
      if (gi)
1127
5.61k
        lci = gi->head;
1128
      
1129
63.4k
      if (cross && xpflg & XPRODUCT) {
1130
63.4k
        if (pmyMgr->suffix_check(linf, ParmString(tmpword, tmpl), 
1131
63.4k
                                 ci, gi,
1132
63.4k
                                 XPRODUCT, (AffEntry *)this)) {
1133
2
          lci = &ci;
1134
          
1135
63.4k
        } else if (gi) {
1136
          
1137
5.61k
          CheckInfo * stop = lci;
1138
5.61k
          for (lci = gi->head; 
1139
5.77k
               lci != stop; 
1140
5.61k
               lci = const_cast<CheckInfo *>(lci->next)) 
1141
157
          {
1142
157
            lci->pre_flag = achar;
1143
157
            lci->pre_strip_len = stripl;
1144
157
            lci->pre_add_len = appndl;
1145
157
            lci->pre_add = appnd;
1146
157
          }
1147
          
1148
57.8k
        } else {
1149
          
1150
57.8k
          lci = 0;
1151
          
1152
57.8k
        }
1153
63.4k
      }
1154
    
1155
63.4k
      if (guess)
1156
522
        lci = guess;
1157
      
1158
63.4k
    quit:
1159
63.4k
      if (lci) {
1160
4.05k
        lci->pre_flag = achar;
1161
4.05k
        lci->pre_strip_len = stripl;
1162
4.05k
        lci->pre_add_len = appndl;
1163
4.05k
        lci->pre_add = appnd;
1164
4.05k
      }
1165
63.4k
      if (lci == &ci) return true;
1166
63.4k
    }
1167
80.3k
  }
1168
82.0k
  return false;
1169
82.0k
}
1170
1171
bool SfxEntry::applicable(SimpleString word) const
1172
0
{
1173
0
  int cond;
1174
  /* make sure all conditions match */
1175
0
  if ((word.size > stripl) && (word.size >= conds->num)) {
1176
0
    const byte * cp = (const byte *) (word + word.size);
1177
0
    for (cond = conds->num; --cond >=0; ) {
1178
0
      if ((conds->get(*--cp) & (1 << cond)) == 0)
1179
0
        break;
1180
0
    }
1181
0
    if (cond < 0) return true;
1182
0
  }
1183
0
  return false;
1184
0
}
1185
1186
// add suffix to this word assuming conditions hold
1187
SimpleString SfxEntry::add(SimpleString word, ObjStack & buf, 
1188
                           int limit, SimpleString orig_word) const
1189
9.85M
{
1190
9.85M
  int cond;
1191
  /* make sure all conditions match */
1192
9.85M
  if ((orig_word.size > stripl) && (orig_word.size >= conds->num)) {
1193
7.94M
    const byte * cp = (const byte *) (orig_word + orig_word.size);
1194
15.1M
    for (cond = conds->num; --cond >=0; ) {
1195
13.1M
      if ((conds->get(*--cp) & (1 << cond)) == 0)
1196
5.94M
        break;
1197
13.1M
    }
1198
7.94M
    if (cond < 0) {
1199
2.00M
      int alen = word.size - stripl;
1200
2.00M
      if (alen >= limit) return EMPTY;
1201
      /* we have a match so add suffix */
1202
2.00M
      char * newword = (char *)buf.alloc(alen + appndl + 1);
1203
2.00M
      memcpy(newword, word, alen);
1204
2.00M
      memcpy(newword + alen, appnd, appndl + 1);
1205
2.00M
      return SimpleString(newword, alen + appndl);
1206
2.00M
    }
1207
7.94M
  }
1208
7.84M
  return SimpleString();
1209
9.85M
}
1210
1211
// see if this suffix is present in the word 
1212
bool SfxEntry::check(const LookupInfo & linf, ParmString word,
1213
                     CheckInfo & ci, GuessInfo * gi,
1214
                     int optflags, AffEntry* ppfx)
1215
41.6M
{
1216
41.6M
  unsigned              tmpl;    // length of tmpword 
1217
41.6M
  int     cond;    // condition beng examined
1218
41.6M
  WordEntry             wordinfo;        // hash entry pointer
1219
41.6M
  byte *  cp;
1220
41.6M
  VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1);
1221
41.6M
  PfxEntry* ep = (PfxEntry *) ppfx;
1222
1223
  // if this suffix is being cross checked with a prefix
1224
  // but it does not support cross products skip it
1225
1226
41.6M
  if ((optflags & XPRODUCT) != 0 &&  (xpflg & XPRODUCT) == 0)
1227
7
    return false;
1228
1229
  // upon entry suffix is 0 length or already matches the end of the word.
1230
  // So if the remaining root word has positive length
1231
  // and if there are enough chars in root word and added back strip chars
1232
  // to meet the number of characters conditions, then test it
1233
1234
41.6M
  tmpl = word.size() - appndl;
1235
1236
41.6M
  if ((tmpl > 0)  &&  (tmpl + stripl >= conds->num)) {
1237
1238
    // generate new root word by removing suffix and adding
1239
    // back any characters that would have been stripped or
1240
    // or null terminating the shorter string
1241
1242
29.8M
    strcpy (tmpword, word);
1243
29.8M
    cp = (byte *)(tmpword + tmpl);
1244
29.8M
    if (stripl) {
1245
28.8M
      strcpy ((char *)cp, strip);
1246
28.8M
      tmpl += stripl;
1247
28.8M
      cp = (byte *)(tmpword + tmpl);
1248
28.8M
    } else *cp = '\0';
1249
1250
    // now make sure all of the conditions on characters
1251
    // are met.  Please see the appendix at the end of
1252
    // this file for more info on exactly what is being
1253
    // tested
1254
1255
65.7M
    for (cond = conds->num;  --cond >= 0; ) {
1256
65.0M
      if ((conds->get(*--cp) & (1 << cond)) == 0) break;
1257
65.0M
    }
1258
1259
    // if all conditions are met then check if resulting
1260
    // root word in the dictionary
1261
1262
29.8M
    if (cond < 0) {
1263
722k
      CheckInfo * lci = 0;
1264
722k
      tmpl += stripl;
1265
722k
      const SensitiveCompare * cmp = 
1266
722k
        optflags & XPRODUCT ? &linf.sp->s_cmp_middle : &linf.sp->s_cmp_begin;
1267
722k
      int res = linf.lookup(tmpword, cmp, achar, wordinfo, gi);
1268
722k
      if (res == 1
1269
2.20k
          && ((optflags & XPRODUCT) == 0 || TESTAFF(wordinfo.aff, ep->achar)))
1270
2.10k
      {
1271
2.10k
        lci = &ci;
1272
2.10k
        lci->word = wordinfo.word;
1273
720k
      } else if (res == 1 && gi) {
1274
18
        lci = gi->add();
1275
18
        lci->word = wordinfo.word;
1276
720k
      } else if (res == -1) { // gi must be defined
1277
2.64k
        lci = gi->head;
1278
2.64k
      }
1279
1280
722k
      if (lci) {
1281
4.76k
        lci->suf_flag = achar;
1282
4.76k
        lci->suf_strip_len = stripl;
1283
4.76k
        lci->suf_add_len = appndl;
1284
4.76k
        lci->suf_add = appnd;
1285
4.76k
      }
1286
      
1287
722k
      if (lci == &ci) return true;
1288
722k
    }
1289
29.8M
  }
1290
41.6M
  return false;
1291
41.6M
}
1292
1293
//////////////////////////////////////////////////////////////////////
1294
//
1295
// new_affix_mgr
1296
//
1297
1298
1299
PosibErr<AffixMgr *> new_affix_mgr(ParmString name, 
1300
                                   Conv & iconv,
1301
                                   const Language * lang)
1302
875
{
1303
875
  if (name == "none")
1304
0
    return 0;
1305
  //CERR << "NEW AFFIX MGR\n";
1306
875
  String file;
1307
875
  file += lang->data_dir();
1308
875
  file += '/';
1309
875
  file += lang->name();
1310
875
  file += "_affix.dat";
1311
875
  AffixMgr * affix;
1312
875
  affix = new AffixMgr(lang);
1313
875
  PosibErrBase pe = affix->setup(file, iconv);
1314
875
  if (pe.has_err()) {
1315
0
    delete affix;
1316
0
    return pe;
1317
875
  } else {
1318
875
    return affix;
1319
875
  }
1320
875
}
1321
}
1322
1323
/**************************************************************************
1324
1325
Appendix:  Understanding Affix Code
1326
1327
1328
An affix is either a  prefix or a suffix attached to root words to make 
1329
other words.
1330
1331
Basically a Prefix or a Suffix is set of AffEntry objects
1332
which store information about the prefix or suffix along 
1333
with supporting routines to check if a word has a particular 
1334
prefix or suffix or a combination.
1335
1336
The structure affentry is defined as follows:
1337
1338
struct AffEntry
1339
{
1340
   unsigned char achar;   // char used to represent the affix
1341
   char * strip;          // string to strip before adding affix
1342
   char * appnd;          // the affix string to add
1343
   short  stripl;         // length of the strip string
1344
   short  appndl;         // length of the affix string
1345
   short  numconds;       // the number of conditions that must be met
1346
   short  xpflg;          // flag: XPRODUCT- combine both prefix and suffix 
1347
   char   conds[SETSIZE]; // array which encodes the conditions to be met
1348
};
1349
1350
1351
Here is a suffix borrowed from the en_US.aff file.  This file 
1352
is whitespace delimited.
1353
1354
SFX D Y 4 
1355
SFX D   0     e          d
1356
SFX D   y     ied        [^aeiou]y
1357
SFX D   0     ed         [^ey]
1358
SFX D   0     ed         [aeiou]y
1359
1360
This information can be interpreted as follows:
1361
1362
In the first line has 4 fields
1363
1364
Field
1365
-----
1366
1     SFX - indicates this is a suffix
1367
2     D   - is the name of the character flag which represents this suffix
1368
3     Y   - indicates it can be combined with prefixes (cross product)
1369
4     4   - indicates that sequence of 4 affentry structures are needed to
1370
               properly store the affix information
1371
1372
The remaining lines describe the unique information for the 4 SfxEntry 
1373
objects that make up this affix.  Each line can be interpreted
1374
as follows: (note fields 1 and 2 are as a check against line 1 info)
1375
1376
Field
1377
-----
1378
1     SFX         - indicates this is a suffix
1379
2     D           - is the name of the character flag for this affix
1380
3     y           - the string of chars to strip off before adding affix
1381
                         (a 0 here indicates the NULL string)
1382
4     ied         - the string of affix characters to add
1383
5     [^aeiou]y   - the conditions which must be met before the affix
1384
                    can be applied
1385
1386
Field 5 is interesting.  Since this is a suffix, field 5 tells us that
1387
there are 2 conditions that must be met.  The first condition is that 
1388
the next to the last character in the word must *NOT* be any of the 
1389
following "a", "e", "i", "o" or "u".  The second condition is that
1390
the last character of the word must end in "y".
1391
1392
So how can we encode this information concisely and be able to 
1393
test for both conditions in a fast manner?  The answer is found
1394
but studying the wonderful ispell code of Geoff Kuenning, et.al. 
1395
(now available under a normal BSD license).
1396
1397
If we set up a conds array of 256 bytes indexed (0 to 255) and access it
1398
using a character (cast to an unsigned char) of a string, we have 8 bits
1399
of information we can store about that character.  Specifically we
1400
could use each bit to say if that character is allowed in any of the 
1401
last (or first for prefixes) 8 characters of the word.
1402
1403
Basically, each character at one end of the word (up to the number 
1404
of conditions) is used to index into the conds array and the resulting 
1405
value found there says whether the that character is valid for a 
1406
specific character position in the word.  
1407
1408
For prefixes, it does this by setting bit 0 if that char is valid 
1409
in the first position, bit 1 if valid in the second position, and so on. 
1410
1411
If a bit is not set, then that char is not valid for that position in the
1412
word.
1413
1414
If working with suffixes bit 0 is used for the character closest 
1415
to the front, bit 1 for the next character towards the end, ..., 
1416
with bit numconds-1 representing the last char at the end of the string. 
1417
1418
Note: since entries in the conds[] are 8 bits, only 8 conditions 
1419
(read that only 8 character positions) can be examined at one
1420
end of a word (the beginning for prefixes and the end for suffixes.
1421
1422
So to make this clearer, lets encode the conds array values for the 
1423
first two affentries for the suffix D described earlier.
1424
1425
1426
  For the first affentry:    
1427
     numconds = 1             (only examine the last character)
1428
1429
     conds['e'] =  (1 << 0)   (the word must end in an E)
1430
     all others are all 0
1431
1432
  For the second affentry:
1433
     numconds = 2             (only examine the last two characters)     
1434
1435
     conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
1436
         where X is all characters *but* a, e, i, o, or u
1437
         
1438
1439
     conds['y'] = (1 << 1)     (the last char must be a y)
1440
     all other bits for all other entries in the conds array are zero
1441
1442
1443
**************************************************************************/