Coverage Report

Created: 2025-08-28 06:20

/src/aspell/modules/speller/default/affix.cpp
Line
Count
Source (jump to first uncovered line)
1
// This file is part of The New Aspell
2
// Copyright (C) 2004 by Kevin Atkinson under the GNU LGPL
3
// license version 2.0 or 2.1.  You should have received a copy of the
4
// LGPL license along with this library if you did not you can find it
5
// at http://www.gnu.org/.
6
//
7
// This code is based on the the MySpell affix code:
8
//
9
/*
10
 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada And
11
 * Contributors.  All rights reserved.
12
 *
13
 * Redistribution and use in source and binary forms, with or without
14
 * modification, are permitted provided that the following conditions
15
 * are met:
16
 *
17
 * 1. Redistributions of source code must retain the above copyright
18
 *    notice, this list of conditions and the following disclaimer.
19
 *
20
 * 2. Redistributions in binary form must reproduce the above copyright
21
 *    notice, this list of conditions and the following disclaimer in the
22
 *    documentation and/or other materials provided with the distribution.
23
 *
24
 * 3. All modifications to the source code must be clearly marked as
25
 *    such.  Binary redistributions based on modified source code
26
 *    must be clearly marked as modified versions in the documentation
27
 *    and/or other materials provided with the distribution.
28
 *
29
 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
30
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
31
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
32
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
33
 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
34
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
35
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
36
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40
 * SUCH DAMAGE.
41
 *
42
 */
43
44
#include <cstdlib>
45
#include <cstring>
46
#include <cstdio>
47
48
//#include "iostream.hpp"
49
50
#include "affix.hpp"
51
#include "errors.hpp"
52
#include "getdata.hpp"
53
#include "parm_string.hpp"
54
#include "check_list.hpp"
55
#include "speller_impl.hpp"
56
#include "vararray.hpp"
57
#include "lsort.hpp"
58
#include "hash-t.hpp"
59
60
#include "gettext.h"
61
62
using namespace std;
63
64
namespace aspeller {
65
66
typedef unsigned char byte;
67
static char EMPTY[1] = {0};
68
69
//////////////////////////////////////////////////////////////////////
70
//
71
// Entry struct definations
72
//
73
74
struct Conds
75
{
76
  char * str;
77
  unsigned num;
78
  char conds[SETSIZE];
79
139M
  char get(byte i) const {return conds[i];}
80
};
81
82
struct AffEntry
83
{
84
  const char *   appnd;
85
  const char *   strip;
86
  byte           appndl;
87
  byte           stripl;
88
  byte           xpflg;
89
  char           achar;
90
  const Conds *  conds;
91
  //unsigned int numconds;
92
  //char         conds[SETSIZE];
93
};
94
95
// A Prefix Entry
96
  
97
struct PfxEntry : public AffEntry
98
{
99
  PfxEntry * next;
100
  PfxEntry * next_eq;
101
  PfxEntry * next_ne;
102
  PfxEntry * flag_next;
103
11.6k
  PfxEntry() {}
104
105
  bool check(const LookupInfo &, const AffixMgr * pmyMgr,
106
             ParmString, CheckInfo &, GuessInfo *, bool cross = true) const;
107
108
170k
  inline bool          allow_cross() const { return ((xpflg & XPRODUCT) != 0); }
109
11.6k
  inline byte flag() const { return achar;  }
110
2.48M
  inline const char *  key() const  { return appnd;  }
111
  bool applicable(SimpleString) const;
112
  SimpleString add(SimpleString, ObjStack & buf) const;
113
};
114
115
// A Suffix Entry
116
117
struct SfxEntry : public AffEntry
118
{
119
  const char * rappnd; // this is set in AffixMgr::build_sfxlist
120
  
121
  SfxEntry *   next;
122
  SfxEntry *   next_eq;
123
  SfxEntry *   next_ne;
124
  SfxEntry *   flag_next;
125
126
1.04M
  SfxEntry() {}
127
128
  bool check(const LookupInfo &, ParmString, CheckInfo &, GuessInfo *,
129
             int optflags, AffEntry * ppfx);
130
131
854k
  inline bool          allow_cross() const { return ((xpflg & XPRODUCT) != 0); }
132
1.04M
  inline byte flag() const { return achar;  }
133
745M
  inline const char *  key() const  { return rappnd; } 
134
  bool applicable(SimpleString) const;
135
  SimpleString add(SimpleString, ObjStack & buf, int limit, SimpleString) const;
136
};
137
138
//////////////////////////////////////////////////////////////////////
139
//
140
// Utility functions declarations
141
//
142
143
/* return 1 if s1 is subset of s2 */
144
static bool isSubset(const char * s1, const char * s2)
145
357M
{
146
1.50G
  while( *s1 && (*s1 == *s2) ) {
147
1.14G
    s1++;
148
1.14G
    s2++;
149
1.14G
  }
150
357M
  return (*s1 == '\0');
151
357M
}
152
153
// return 1 if s1 (reversed) is a leading subset of end of s2
154
static bool isRevSubset(const char * s1, const char * end_of_s2, int len)
155
11.2M
{
156
26.2M
  while( (len > 0) && *s1 && (*s1 == *end_of_s2) ) {
157
15.0M
    s1++;
158
15.0M
    end_of_s2--;
159
15.0M
    len --;
160
15.0M
  }
161
11.2M
  return (*s1 == '\0');
162
11.2M
}
163
164
template <class T>
165
struct AffixLess
166
{
167
11.2M
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
aspeller::AffixLess<aspeller::PfxEntry>::operator()(aspeller::PfxEntry*, aspeller::PfxEntry*) const
Line
Count
Source
167
11.1k
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
aspeller::AffixLess<aspeller::SfxEntry>::operator()(aspeller::SfxEntry*, aspeller::SfxEntry*) const
Line
Count
Source
167
11.2M
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
168
};
169
170
// struct StringLookup {
171
//   struct Parms {
172
//     typedef const char * Value;
173
//     typedef const char * Key;
174
//     static const bool is_multi = false;
175
//     hash<const char *> hfun;
176
//     size_t hash(const char * s) {return hfun(s);}
177
//     bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;}
178
//     const char * key(const char * c) {return c;}
179
//   };
180
//   typedef HashTable<Parms> Lookup;
181
//   Lookup lookup;
182
//   ObjStack * data_buf;
183
//   StringLookup(ObjStack * b) : data_buf(b) {}
184
//   const char * dup(const char * orig) {
185
//     pair<Lookup::iterator, bool> res = lookup.insert(orig);
186
//     if (res.second) *res.first = data_buf->dup(orig);
187
//     return *res.first;
188
//     //return data_buf->dup(orig);
189
//   }
190
// };
191
192
struct CondsLookupParms {
193
  typedef const Conds * Value;
194
  typedef const char * Key;
195
  static const bool is_multi = false;
196
  acommon::hash<const char *> hfun;
197
1.09M
  size_t hash(const char * s) {return hfun(s);}
198
1.63M
  bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;}
199
1.67M
  const char * key(const Conds * c) {return c->str;}
200
};
201
202
typedef HashTable<CondsLookupParms> CondsLookup;
203
204
// normalizes and checks the cond_str
205
// returns the length of the new string or -1 if invalid
206
static int normalize_cond_str(char * str)
207
1.05M
{
208
1.05M
  char * s = str;
209
1.05M
  char * d = str;
210
5.42M
  while (*s) {
211
4.36M
    if (*s != '[') {
212
4.23M
      *d++ = *s++;
213
4.23M
    } else if (s[1] == '\0' || s[1] == ']') {
214
0
      return -1;
215
132k
    } else if (s[2] == ']') {
216
0
      *d++ = s[1];
217
0
      s += 3;
218
132k
    } else {
219
132k
      *d++ = *s++;
220
132k
      if (*s == '^') *d++ = *s++;
221
503k
      while (*s != ']') {
222
370k
        if (*s == '\0' || *s == '[') return -1;
223
370k
        char * min = s;
224
801k
        for (char * i = s + 1; *i != ']'; ++i) {
225
430k
          if ((byte)*i < (byte)*min) min = i;}
226
370k
        char c = *s;
227
370k
        *d++ = *min;
228
370k
        *min = c;
229
370k
        ++s;
230
370k
      }
231
132k
      *d++ = *s++;
232
132k
    }
233
4.36M
  }
234
1.05M
  *d = '\0';
235
1.05M
  return d - str;
236
1.05M
}
237
238
static void encodeit(CondsLookup &, ObjStack &, 
239
                     AffEntry * ptr, char * cs);
240
241
//////////////////////////////////////////////////////////////////////
242
//
243
// Affix Manager
244
//
245
246
PosibErr<void> AffixMgr::setup(ParmString affpath, Conv & iconv)
247
1.06k
{
248
  // register hash manager and load affix data from aff file
249
  //cpdmin = 3;  // default value
250
1.06k
  max_strip_ = 0;
251
272k
  for (int i=0; i < SETSIZE; i++) {
252
271k
    pStart[i] = NULL;
253
271k
    sStart[i] = NULL;
254
271k
    pFlag[i] = NULL;
255
271k
    sFlag[i] = NULL;
256
271k
    max_strip_f[i] = 0;
257
271k
  }
258
1.06k
  return parse_file(affpath, iconv);
259
1.06k
}
260
261
AffixMgr::AffixMgr(const Language * l) 
262
1.06k
  : lang(l), data_buf(1024*16) {}
263
264
1.06k
AffixMgr::~AffixMgr() {}
265
266
static inline void max_(int & lhs, int rhs) 
267
1.44M
{
268
1.44M
  if (lhs < rhs) lhs = rhs;
269
1.44M
}
270
271
// read in aff file and build up prefix and suffix entry objects 
272
PosibErr<void> AffixMgr::parse_file(const char * affpath, Conv & iconv)
273
1.06k
{
274
  // io buffers
275
1.06k
  String buf; DataPair dp;
276
277
1.06k
  CondsLookup conds_lookup;
278
 
279
  // open the affix file
280
1.06k
  affix_file = data_buf.dup(affpath);
281
1.06k
  FStream afflst;
282
1.06k
  RET_ON_ERR(afflst.open(affpath,"r"));
283
284
  // step one is to parse the affix file building up the internal
285
  // affix data structures
286
287
  // read in each line ignoring any that do not
288
  // start with a known line type indicator
289
290
1.06k
  char prev_aff = '\0';
291
292
166k
  while (getdata_pair(afflst,dp,buf)) {
293
164k
    char affix_type = ' ';
294
295
    /* parse in the name of the character set used by the .dict and .aff */
296
297
164k
    if (dp.key == "SET") {
298
1.06k
      String buf;
299
1.06k
      encoding = data_buf.dup(fix_encoding_str(dp.value, buf));
300
1.06k
      if (strcmp(encoding, lang->data_encoding()) != 0)
301
0
        return make_err(incorrect_encoding, affix_file, lang->data_encoding(), encoding);
302
1.06k
    }
303
304
    /* parse in the flag used by the controlled compound words */
305
    //else if (d.key == "COMPOUNDFLAG")
306
    //  compound = data_buf.dup(d.value);
307
308
    /* parse in the flag used by the controlled compound words */
309
    //else if (d.key == "COMPOUNDMIN")
310
    //  cpdmin = atoi(d.value); // FiXME
311
312
    //else if (dp.key == "TRY" || dp.key == "REP");
313
314
163k
    else if (dp.key == "PFX" || dp.key == "SFX")
315
27.5k
      affix_type = dp.key[0];
316
317
164k
    if (affix_type == ' ') continue;
318
319
    //
320
    // parse this affix: P - prefix, S - suffix
321
    //
322
323
27.5k
    int numents = 0;      // number of affentry structures to parse
324
27.5k
    char achar='\0';      // affix char identifier
325
27.5k
    short xpflg=0;
326
27.5k
    AffEntry * nptr;
327
27.5k
    {
328
      // split affix header line into pieces
329
27.5k
      split(dp);
330
27.5k
      if (dp.key.empty()) goto error;
331
      // key is affix char
332
27.5k
      const char * astr = iconv(dp.key);
333
27.5k
      if (astr[0] == '\0' || astr[1] != '\0') goto error;
334
27.5k
      achar = astr[0];
335
27.5k
      if (achar == prev_aff) goto error_count;
336
27.5k
      prev_aff = achar;
337
338
27.5k
      split(dp);
339
27.5k
      if (dp.key.size != 1 || 
340
27.5k
          !(dp.key[0] == 'Y' || dp.key[0] == 'N')) goto error;
341
      // key is cross product indicator 
342
27.5k
      if (dp.key[0] == 'Y') xpflg = XPRODUCT;
343
    
344
27.5k
      split(dp);
345
27.5k
      if (dp.key.empty()) goto error;
346
      // key is number of affentries
347
      
348
27.5k
      numents = atoi(dp.key); 
349
  
350
1.08M
      for (int j = 0; j < numents; j++) {
351
1.05M
        getdata_pair(afflst, dp, buf);
352
353
1.05M
        if (affix_type == 'P') {
354
11.6k
          nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(PfxEntry));
355
11.6k
          new (nptr) PfxEntry;
356
1.04M
        } else {
357
1.04M
          nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(SfxEntry));
358
1.04M
          new (nptr) SfxEntry;
359
1.04M
        }
360
361
1.05M
        nptr->xpflg = xpflg;
362
363
1.05M
        split(dp);
364
1.05M
        if (dp.key.empty()) goto error;
365
        // key is affix charter
366
1.05M
        if (iconv(dp.key)[0] != achar) goto error_count;
367
1.05M
        nptr->achar = achar;
368
 
369
1.05M
        split(dp);
370
1.05M
        if (dp.key.empty()) goto error;
371
        // key is strip 
372
1.05M
        if (dp.key != "0") {
373
720k
          ParmString s0(iconv(dp.key));
374
720k
          max_(max_strip_, s0.size());
375
720k
          max_(max_strip_f[(byte)achar], s0.size());
376
720k
          nptr->strip = data_buf.dup(s0);
377
720k
          nptr->stripl = s0.size();
378
720k
        } else {
379
338k
          nptr->strip  = "";
380
338k
          nptr->stripl = 0;
381
338k
        }
382
    
383
1.05M
        split(dp);
384
1.05M
        if (dp.key.empty()) goto error;
385
        // key is affix string or 0 for null
386
1.05M
        if (dp.key != "0") {
387
1.05M
          nptr->appnd  = data_buf.dup(iconv(dp.key));
388
1.05M
          nptr->appndl = strlen(nptr->appnd);
389
1.05M
        } else {
390
2.26k
          nptr->appnd  = "";
391
2.26k
          nptr->appndl = 0;
392
2.26k
        }
393
    
394
1.05M
        split(dp);
395
1.05M
        if (dp.key.empty()) goto error;
396
        // key is the conditions descriptions
397
1.05M
        char * cond = iconv(dp.key);
398
1.05M
        int cond_len = normalize_cond_str(cond);
399
1.05M
        if (cond_len < 0)
400
0
          return (make_err(invalid_cond, MsgConv(lang)(cond))
401
0
                  .with_file(affix_file, dp.line_num));
402
1.05M
        if (nptr->stripl != 0) {
403
720k
          char * cc = cond;
404
720k
          if (affix_type == 'S') cc += cond_len - nptr->stripl;
405
720k
          if (cond_len < nptr->stripl || 
406
720k
              memcmp(cc, nptr->strip, nptr->stripl) != 0)
407
0
            return (make_err(invalid_cond_strip, 
408
0
                             MsgConv(lang)(cond), MsgConv(lang)(nptr->strip))
409
0
                    .with_file(affix_file, dp.line_num));
410
720k
        }
411
1.05M
        encodeit(conds_lookup, data_buf, nptr, cond);
412
    
413
        // now create SfxEntry or PfxEntry objects and use links to
414
        // build an ordered (sorted by affix string) list
415
1.05M
        if (affix_type == 'P')
416
11.6k
          build_pfxlist(static_cast<PfxEntry *>(nptr));
417
1.04M
        else
418
1.04M
          build_sfxlist(static_cast<SfxEntry *>(nptr)); 
419
1.05M
      }
420
27.5k
    }
421
27.5k
    continue;
422
27.5k
  error:
423
0
    return make_err(corrupt_affix, MsgConv(lang)(achar)).with_file(affix_file, dp.line_num);
424
0
  error_count:
425
0
    return make_err(corrupt_affix, MsgConv(lang)(achar), 
426
0
                    _("Possibly incorrect count.")).with_file(affix_file, dp.line_num);
427
27.5k
  }
428
1.06k
  afflst.close();
429
430
  // now we can speed up performance greatly taking advantage of the 
431
  // relationship between the affixes and the idea of "subsets".
432
433
  // View each prefix as a potential leading subset of another and view
434
  // each suffix (reversed) as a potential trailing subset of another.
435
436
  // To illustrate this relationship if we know the prefix "ab" is
437
  // found in the word to examine, only prefixes that "ab" is a
438
  // leading subset of need be examined.  Furthermore is "ab" is not
439
  // present then none of the prefixes that "ab" is is a subset need
440
  // be examined.
441
442
  // The same argument goes for suffix string that are reversed.
443
444
  // Then to top this off why not examine the first char of the word
445
  // to quickly limit the set of prefixes to examine (i.e. the
446
  // prefixes to examine must be leading supersets of the first
447
  // character of the word (if they exist)
448
 
449
  // To take advantage of this "subset" relationship, we need to add
450
  // two links from entry.  One to take next if the current prefix
451
  // is found (call it nexteq) and one to take next if the current
452
  // prefix is not found (call it nextne).
453
454
  // Since we have built ordered lists, all that remains is to
455
  // properly initialize the nextne and nexteq pointers that relate
456
  // them
457
458
1.06k
  process_pfx_order();
459
1.06k
  process_sfx_order();
460
461
  //CERR.printf("%u\n", data_buf.calc_size()/1024);
462
463
1.06k
  return no_err;
464
465
1.06k
}
466
467
468
// we want to be able to quickly access prefix information
469
// both by prefix flag, and sorted by prefix string itself
470
// so we need to set up two indexes
471
472
PosibErr<void> AffixMgr::build_pfxlist(PfxEntry* pfxptr)
473
11.6k
{
474
11.6k
  PfxEntry * ptr;
475
11.6k
  PfxEntry * ep = pfxptr;
476
477
  // get the right starting point 
478
11.6k
  const char * key = ep->key();
479
11.6k
  const byte flg = ep->flag();
480
481
  // first index by flag which must exist
482
11.6k
  ptr = pFlag[flg];
483
11.6k
  ep->flag_next = ptr;
484
11.6k
  pFlag[flg] = ep;
485
486
  // next insert the affix string, it will be sorted latter
487
488
11.6k
  byte sp = *((const byte *)key);
489
11.6k
  ptr = pStart[sp];
490
11.6k
  ep->next = ptr;
491
11.6k
  pStart[sp] = ep;
492
11.6k
  return no_err;
493
11.6k
}
494
495
// we want to be able to quickly access suffix information
496
// both by suffix flag, and sorted by the reverse of the
497
// suffix string itself; so we need to set up two indexes
498
499
PosibErr<void> AffixMgr::build_sfxlist(SfxEntry* sfxptr)
500
1.04M
{
501
1.04M
  SfxEntry * ptr;
502
1.04M
  SfxEntry * ep = sfxptr;
503
1.04M
  char * tmp = (char *)data_buf.alloc(sfxptr->appndl + 1);
504
1.04M
  sfxptr->rappnd = tmp;
505
506
  // reverse the string
507
1.04M
  char * dest = tmp + sfxptr->appndl;
508
1.04M
  *dest-- = 0;
509
1.04M
  const char * src = sfxptr->appnd;
510
7.14M
  for (; dest >= tmp; --dest, ++src)
511
6.09M
    *dest = *src;
512
513
  /* get the right starting point */
514
1.04M
  const char * key = ep->key();
515
1.04M
  const byte flg = ep->flag();
516
517
  // first index by flag which must exist
518
1.04M
  ptr = sFlag[flg];
519
1.04M
  ep->flag_next = ptr;
520
1.04M
  sFlag[flg] = ep;
521
522
  // next insert the affix string, it will be sorted latter
523
    
524
1.04M
  byte sp = *((const byte *)key);
525
1.04M
  ptr = sStart[sp];
526
1.04M
  ep->next = ptr;
527
1.04M
  sStart[sp] = ep;
528
1.04M
  return no_err;
529
1.04M
}
530
531
532
533
// initialize the PfxEntry links NextEQ and NextNE to speed searching
534
PosibErr<void> AffixMgr::process_pfx_order()
535
1.06k
{
536
1.06k
  PfxEntry* ptr;
537
538
  // loop through each prefix list starting point
539
271k
  for (int i=1; i < SETSIZE; i++) {
540
541
270k
    ptr = pStart[i];
542
543
270k
    if (ptr && ptr->next)
544
1.49k
      ptr = pStart[i] = sort(ptr, AffixLess<PfxEntry>());
545
546
    // look through the remainder of the list
547
    //  and find next entry with affix that 
548
    // the current one is not a subset of
549
    // mark that as destination for NextNE
550
    // use next in list that you are a subset
551
    // of as NextEQ
552
553
282k
    for (; ptr != NULL; ptr = ptr->next) {
554
555
11.6k
      PfxEntry * nptr = ptr->next;
556
16.6k
      for (; nptr != NULL; nptr = nptr->next) {
557
9.40k
        if (! isSubset( ptr->key() , nptr->key() )) break;
558
9.40k
      }
559
11.6k
      ptr->next_ne = nptr;
560
11.6k
      ptr->next_eq = NULL;
561
11.6k
      if ((ptr->next) && isSubset(ptr->key() , 
562
4.96k
                                  (ptr->next)->key())) 
563
1.20k
        ptr->next_eq = ptr->next;
564
11.6k
    }
565
566
    // now clean up by adding smart search termination strings
567
    // if you are already a superset of the previous prefix
568
    // but not a subset of the next, search can end here
569
    // so set NextNE properly
570
571
270k
    ptr = pStart[i];
572
282k
    for (; ptr != NULL; ptr = ptr->next) {
573
11.6k
      PfxEntry * nptr = ptr->next;
574
11.6k
      PfxEntry * mptr = NULL;
575
16.6k
      for (; nptr != NULL; nptr = nptr->next) {
576
9.40k
        if (! isSubset(ptr->key(),nptr->key())) break;
577
4.95k
        mptr = nptr;
578
4.95k
      }
579
11.6k
      if (mptr) mptr->next_ne = NULL;
580
11.6k
    }
581
270k
  }
582
1.06k
  return no_err;
583
1.06k
}
584
585
586
587
// initialize the SfxEntry links NextEQ and NextNE to speed searching
588
PosibErr<void> AffixMgr::process_sfx_order()
589
1.06k
{
590
1.06k
  SfxEntry* ptr;
591
592
  // loop through each prefix list starting point
593
271k
  for (int i=1; i < SETSIZE; i++) {
594
595
270k
    ptr = sStart[i];
596
597
270k
    if (ptr && ptr->next)
598
9.23k
      ptr = sStart[i] = sort(ptr, AffixLess<SfxEntry>());
599
600
    // look through the remainder of the list
601
    //  and find next entry with affix that 
602
    // the current one is not a subset of
603
    // mark that as destination for NextNE
604
    // use next in list that you are a subset
605
    // of as NextEQ
606
607
1.31M
    for (; ptr != NULL; ptr = ptr->next) {
608
1.04M
      SfxEntry * nptr = ptr->next;
609
177M
      for (; nptr != NULL; nptr = nptr->next) {
610
177M
        if (! isSubset(ptr->key(),nptr->key())) break;
611
177M
      }
612
1.04M
      ptr->next_ne = nptr;
613
1.04M
      ptr->next_eq = NULL;
614
1.04M
      if ((ptr->next) && isSubset(ptr->key(),(ptr->next)->key())) 
615
852k
        ptr->next_eq = ptr->next;
616
1.04M
    }
617
618
619
    // now clean up by adding smart search termination strings:
620
    // if you are already a superset of the previous suffix
621
    // but not a subset of the next, search can end here
622
    // so set NextNE properly
623
624
270k
    ptr = sStart[i];
625
1.31M
    for (; ptr != NULL; ptr = ptr->next) {
626
1.04M
      SfxEntry * nptr = ptr->next;
627
1.04M
      SfxEntry * mptr = NULL;
628
177M
      for (; nptr != NULL; nptr = nptr->next) {
629
177M
        if (! isSubset(ptr->key(),nptr->key())) break;
630
176M
        mptr = nptr;
631
176M
      }
632
1.04M
      if (mptr) mptr->next_ne = NULL;
633
1.04M
    }
634
270k
  }
635
1.06k
  return no_err;
636
1.06k
}
637
638
// takes aff file condition string and creates the
639
// conds array - please see the appendix at the end of the
640
// file affentry.cxx which describes what is going on here
641
// in much more detail
642
643
static void encodeit(CondsLookup & l, ObjStack & buf, 
644
                     AffEntry * ptr, char * cs)
645
1.05M
{
646
1.05M
  byte c;
647
1.05M
  int i, j, k;
648
649
  // see if we already have this conds matrix
650
651
1.05M
  CondsLookup::iterator itr = l.find(cs);
652
1.05M
  if (!(itr == l.end())) {
653
1.03M
    ptr->conds = *itr;
654
1.03M
    return;
655
1.03M
  }
656
657
25.3k
  Conds * cds = (Conds *)buf.alloc_bottom(sizeof(Conds));
658
25.3k
  cds->str = buf.dup(cs);
659
25.3k
  l.insert(cds);
660
25.3k
  ptr->conds = cds;
661
662
25.3k
  int nc = strlen(cs);
663
25.3k
  VARARRAYM(byte, mbr, nc + 1, MAXLNLEN);
664
665
  // now clear the conditions array
666
25.3k
  memset(cds->conds, 0, sizeof(cds->conds));
667
668
  // now parse the string to create the conds array
669
  
670
25.3k
  int neg = 0;   // complement indicator
671
25.3k
  int grp = 0;   // group indicator
672
25.3k
  int n = 0;     // number of conditions
673
25.3k
  int ec = 0;    // end condition indicator
674
25.3k
  int nm = 0;    // number of member in group
675
676
  // if no condition just return
677
25.3k
  if (strcmp(cs,".")==0) {
678
1.06k
    cds->num = 0;
679
1.06k
    return;
680
1.06k
  }
681
682
24.3k
  i = 0;
683
146k
  while (i < nc) {
684
121k
    c = *((byte *)(cs + i));
685
686
    // start group indicator
687
121k
    if (c == '[') {
688
12.1k
      grp = 1;
689
12.1k
      c = 0;
690
12.1k
    }
691
692
    // complement flag
693
121k
    if ((grp == 1) && (c == '^')) {
694
8.64k
      neg = 1;
695
8.64k
      c = 0;
696
8.64k
    }
697
698
    // end goup indicator
699
121k
    if (c == ']') {
700
12.1k
      ec = 1;
701
12.1k
      c = 0;
702
12.1k
    }
703
704
    // add character of group to list
705
121k
    if ((grp == 1) && (c != 0)) {
706
42.2k
      *(mbr + nm) = c;
707
42.2k
      nm++;
708
42.2k
      c = 0;
709
42.2k
    }
710
711
    // end of condition 
712
121k
    if (c != 0) {
713
46.7k
      ec = 1;
714
46.7k
    }
715
716
    
717
121k
    if (ec) {
718
58.8k
      if (grp == 1) {
719
12.1k
        if (neg == 0) {
720
          // set the proper bits in the condition array vals for those chars
721
17.8k
          for (j=0;j<nm;j++) {
722
14.3k
            k = (unsigned int) mbr[j];
723
14.3k
            cds->conds[k] = cds->conds[k] | (1 << n);
724
14.3k
          }
725
8.64k
        } else {
726
          // complement so set all of them and then unset indicated ones
727
2.22M
          for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n);
728
36.5k
          for (j=0;j<nm;j++) {
729
27.8k
            k = (unsigned int) mbr[j];
730
27.8k
            cds->conds[k] = cds->conds[k] & ~(1 << n);
731
27.8k
          }
732
8.64k
        }
733
12.1k
        neg = 0;
734
12.1k
        grp = 0;   
735
12.1k
        nm = 0;
736
46.7k
      } else {
737
        // not a group so just set the proper bit for this char
738
        // but first handle special case of . inside condition
739
46.7k
        if (c == '.') {
740
          // wild card character so set them all
741
0
          for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n);
742
46.7k
        } else {  
743
46.7k
          cds->conds[(unsigned int)c] = cds->conds[(unsigned int)c] | (1 << n);
744
46.7k
        }
745
46.7k
      }
746
58.8k
      n++;
747
58.8k
      ec = 0;
748
58.8k
    }
749
750
751
121k
    i++;
752
121k
  }
753
24.3k
  cds->num = n;
754
24.3k
  return;
755
25.3k
}
756
757
758
// check word for prefixes
759
bool AffixMgr::prefix_check (const LookupInfo & linf, ParmString word, 
760
                             CheckInfo & ci, GuessInfo * gi, bool cross) const
761
1.17M
{
762
1.17M
  if (word.empty()) return false;
763
 
764
  // first handle the special case of 0 length prefixes
765
1.17M
  PfxEntry * pe = pStart[0];
766
1.17M
  while (pe) {
767
0
    if (pe->check(linf,this,word,ci,gi)) return true;
768
0
    pe = pe->next;
769
0
  }
770
  
771
  // now handle the general case
772
1.17M
  byte sp = *reinterpret_cast<const byte *>(word.str());
773
1.17M
  PfxEntry * pptr = pStart[sp];
774
775
3.57M
  while (pptr) {
776
2.40M
    if (isSubset(pptr->key(),word)) {
777
549k
      if (pptr->check(linf,this,word,ci,gi,cross)) return true;
778
548k
      pptr = pptr->next_eq;
779
1.85M
    } else {
780
1.85M
      pptr = pptr->next_ne;
781
1.85M
    }
782
2.40M
  }
783
    
784
1.17M
  return false;
785
1.17M
}
786
787
788
// check word for suffixes
789
bool AffixMgr::suffix_check (const LookupInfo & linf, ParmString word, 
790
                             CheckInfo & ci, GuessInfo * gi,
791
                             int sfxopts, AffEntry * ppfx) const
792
1.37M
{
793
1.37M
  if (word.empty()) return false;
794
795
  // first handle the special case of 0 length suffixes
796
1.37M
  SfxEntry * se = sStart[0];
797
67.6M
  while (se) {
798
66.2M
    if (se->check(linf, word, ci, gi, sfxopts, ppfx)) return true;
799
66.2M
    se = se->next;
800
66.2M
  }
801
  
802
  // now handle the general case
803
1.37M
  byte sp = *((const byte *)(word + word.size() - 1));
804
1.37M
  SfxEntry * sptr = sStart[sp];
805
806
12.5M
  while (sptr) {
807
11.2M
    if (isRevSubset(sptr->key(), word + word.size() - 1, word.size())) {
808
5.70M
      if (sptr->check(linf, word, ci, gi, sfxopts, ppfx)) return true;
809
5.70M
      sptr = sptr->next_eq;
810
5.70M
    } else {
811
5.50M
      sptr = sptr->next_ne;
812
5.50M
    }
813
11.2M
  }
814
    
815
1.36M
  return false;
816
1.37M
}
817
818
// check if word with affixes is correctly spelled
819
bool AffixMgr::affix_check(const LookupInfo & linf, ParmString word, 
820
                           CheckInfo & ci, GuessInfo * gi) const
821
1.17M
{
822
1.17M
  if (word.empty()) return false;
823
824
  // Deal With Case in a semi-intelligent manner
825
1.17M
  CasePattern cp = lang->LangImpl::case_pattern(word);
826
1.17M
  ParmString pword = word;
827
1.17M
  ParmString sword = word;
828
1.17M
  CharVector lower;
829
1.17M
  if (cp == FirstUpper) {
830
22.2k
    lower.append(word, word.size() + 1);
831
22.2k
    lower[0] = lang->to_lower(word[0]);
832
22.2k
    pword = ParmString(lower.data(), lower.size() - 1);
833
1.14M
  } else if (cp == AllUpper) {
834
55.2k
    lower.resize(word.size() + 1);
835
55.2k
    unsigned int i = 0;
836
898k
    for (; i != word.size(); ++i)
837
843k
      lower[i] = lang->to_lower(word[i]);
838
55.2k
    lower[i] = '\0';
839
55.2k
    pword = ParmString(lower.data(), lower.size() - 1);
840
55.2k
    sword = pword;
841
55.2k
  }
842
843
  // check all prefixes (also crossed with suffixes if allowed) 
844
1.17M
  if (prefix_check(linf, pword, ci, gi)) return true;
845
846
  // if still not found check all suffixes
847
1.17M
  if (suffix_check(linf, sword, ci, gi, 0, NULL)) return true;
848
849
  // if still not found check again but with the lower case version
850
  // which can make a difference if the entire word matches the cond
851
  // string
852
1.16M
  if (cp == FirstUpper) {
853
22.2k
    return suffix_check(linf, pword, ci, gi, 0, NULL);
854
1.14M
  } else {
855
1.14M
    return false;
856
1.14M
  }
857
1.16M
}
858
859
void AffixMgr::munch(ParmString word, GuessInfo * gi, bool cross) const
860
2.73k
{
861
2.73k
  LookupInfo li(0, LookupInfo::AlwaysTrue);
862
2.73k
  CheckInfo ci;
863
2.73k
  gi->reset();
864
2.73k
  CasePattern cp = lang->LangImpl::case_pattern(word);
865
2.73k
  if (cp == AllUpper) return;
866
2.69k
  if (cp != FirstUpper)
867
2.65k
    prefix_check(li, word, ci, gi, cross);
868
2.69k
  suffix_check(li, word, ci, gi, 0, NULL);
869
2.69k
}
870
871
WordAff * AffixMgr::expand(ParmString word, ParmString aff, 
872
                           ObjStack & buf, int limit) const
873
730k
{
874
730k
  byte * empty = (byte *)buf.alloc(1);
875
730k
  *empty = 0;
876
877
730k
  byte * suf  = (byte *)buf.alloc(aff.size() + 1); 
878
730k
  byte * suf_e = suf;
879
730k
  byte * csuf = (byte *)buf.alloc(aff.size() + 1); 
880
730k
  byte * csuf_e = csuf;
881
882
730k
  WordAff * head = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
883
730k
  WordAff * cur = head;
884
730k
  cur->word = buf.dup(word);
885
730k
  cur->aff  = suf;
886
887
730k
  for (const byte * c = (const byte *)aff.str(), * end = c + aff.size();
888
1.75M
       c != end; 
889
1.02M
       ++c) 
890
1.02M
  {
891
1.02M
    if (sFlag[*c]) *suf_e++ = *c; 
892
1.02M
    if (sFlag[*c] && sFlag[*c]->allow_cross()) *csuf_e++ = *c;
893
    
894
1.41M
    for (PfxEntry * p = pFlag[*c]; p; p = p->flag_next) {
895
394k
      SimpleString newword = p->add(word, buf);
896
394k
      if (!newword) continue;
897
170k
      cur->next = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
898
170k
      cur = cur->next;
899
170k
      cur->word = newword;
900
170k
      cur->aff = p->allow_cross() ? csuf : empty;
901
170k
    }
902
1.02M
  }
903
904
730k
  *suf_e = 0;
905
730k
  *csuf_e = 0;
906
730k
  cur->next = 0;
907
908
730k
  if (limit == 0) return head;
909
910
730k
  WordAff * * end = &cur->next;
911
730k
  WordAff * * very_end = end;
912
730k
  size_t nsuf_s = suf_e - suf + 1;
913
914
1.63M
  for (WordAff * * cur = &head; cur != end; cur = &(*cur)->next) {
915
900k
    if ((int)(*cur)->word.size - max_strip_ >= limit) continue;
916
900k
    byte * nsuf = (byte *)buf.alloc(nsuf_s);
917
900k
    expand_suffix((*cur)->word, (*cur)->aff, buf, limit, nsuf, &very_end, word);
918
900k
    (*cur)->aff = nsuf;
919
900k
  }
920
921
730k
  return head;
922
730k
}
923
924
WordAff * AffixMgr::expand_suffix(ParmString word, const byte * aff, 
925
                                  ObjStack & buf, int limit,
926
                                  byte * new_aff, WordAff * * * l,
927
                                  ParmString orig_word) const
928
900k
{
929
900k
  WordAff * head = 0;
930
900k
  if (l) head = **l;
931
900k
  WordAff * * cur = l ? *l : &head;
932
900k
  bool expanded     = false;
933
900k
  bool not_expanded = false;
934
900k
  if (!orig_word) orig_word = word;
935
936
1.96M
  while (*aff) {
937
1.05M
    if ((int)word.size() - max_strip_f[*aff] < limit) {
938
26.5M
      for (SfxEntry * p = sFlag[*aff]; p; p = p->flag_next) {
939
25.4M
        SimpleString newword = p->add(word, buf, limit, orig_word);
940
25.4M
        if (!newword) continue;
941
4.64M
        if (newword == EMPTY) {not_expanded = true; continue;}
942
4.64M
        *cur = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
943
4.64M
        (*cur)->word = newword;
944
4.64M
        (*cur)->aff  = (const byte *)EMPTY;
945
4.64M
        cur = &(*cur)->next;
946
4.64M
        expanded = true;
947
4.64M
      }
948
1.05M
    }
949
1.05M
    if (new_aff && (!expanded || not_expanded)) *new_aff++ = *aff;
950
1.05M
    ++aff;
951
1.05M
  }
952
900k
  *cur = 0;
953
900k
  if (new_aff) *new_aff = 0;
954
900k
  if (l) *l = cur;
955
900k
  return head;
956
900k
}
957
958
CheckAffixRes AffixMgr::check_affix(ParmString word, char aff) const
959
0
{
960
0
  CheckAffixRes res = InvalidAffix;
961
  
962
0
  for (PfxEntry * p = pFlag[(unsigned char)aff]; p; p = p->flag_next) {
963
0
    res = InapplicableAffix;
964
0
    if (p->applicable(word)) return ValidAffix;
965
0
  }
966
967
0
  for (SfxEntry * p = sFlag[(unsigned char)aff]; p; p = p->flag_next) {
968
0
    if (res == InvalidAffix) res = InapplicableAffix;
969
0
    if (p->applicable(word)) return ValidAffix;
970
0
  }
971
972
0
  return res;
973
0
}
974
975
976
977
//////////////////////////////////////////////////////////////////////
978
//
979
// LookupInfo
980
//
981
982
int LookupInfo::lookup (ParmString word, const SensitiveCompare * c, 
983
                        char achar, 
984
                        WordEntry & o, GuessInfo * gi) const
985
700k
{
986
700k
  SpellerImpl::WS::const_iterator i = begin;
987
700k
  const char * g = 0;
988
700k
  if (mode == Word) {
989
171k
    do {
990
171k
      (*i)->lookup(word, c, o);
991
183k
      for (;!o.at_end(); o.adv()) {
992
12.6k
        if (TESTAFF(o.aff, achar))
993
150
          return 1;
994
12.5k
        else
995
12.5k
          g = o.word;
996
12.6k
      }
997
170k
      ++i;
998
170k
    } while (i != end);
999
621k
  } else if (mode == Clean) {
1000
621k
    do {
1001
621k
      (*i)->clean_lookup(word, o);
1002
696k
      for (;!o.at_end(); o.adv()) {
1003
77.4k
        if (TESTAFF(o.aff, achar))
1004
1.80k
          return 1;
1005
75.6k
        else
1006
75.6k
          g = o.word;
1007
77.4k
      }
1008
619k
      ++i;
1009
619k
    } while (i != end);
1010
621k
  } else if (gi) {
1011
893
    g = gi->dup(word);
1012
893
  }
1013
698k
  if (gi && g) {
1014
6.80k
    CheckInfo * ci = gi->add();
1015
6.80k
    ci->word = g;
1016
6.80k
    return -1;
1017
6.80k
  }
1018
691k
  return 0;
1019
698k
}
1020
1021
//////////////////////////////////////////////////////////////////////
1022
//
1023
// Affix Entry
1024
//
1025
1026
bool PfxEntry::applicable(SimpleString word) const
1027
0
{
1028
0
  unsigned int cond;
1029
  /* make sure all conditions match */
1030
0
  if ((word.size > stripl) && (word.size >= conds->num)) {
1031
0
    const byte * cp = (const byte *) word.str;
1032
0
    for (cond = 0;  cond < conds->num;  cond++) {
1033
0
      if ((conds->get(*cp++) & (1 << cond)) == 0)
1034
0
        break;
1035
0
    }
1036
0
    if (cond >= conds->num) return true;
1037
0
  }
1038
0
  return false;
1039
0
}
1040
1041
// add prefix to this word assuming conditions hold
1042
SimpleString PfxEntry::add(SimpleString word, ObjStack & buf) const
1043
394k
{
1044
394k
  unsigned int cond;
1045
  /* make sure all conditions match */
1046
394k
  if ((word.size > stripl) && (word.size >= conds->num)) {
1047
394k
    const byte * cp = (const byte *) word.str;
1048
482k
    for (cond = 0;  cond < conds->num;  cond++) {
1049
312k
      if ((conds->get(*cp++) & (1 << cond)) == 0)
1050
224k
        break;
1051
312k
    }
1052
394k
    if (cond >= conds->num) {
1053
      /* */
1054
170k
      int alen = word.size - stripl;
1055
170k
      char * newword = (char *)buf.alloc(alen + appndl + 1);
1056
170k
      if (appndl) memcpy(newword, appnd, appndl);
1057
170k
      memcpy(newword + appndl, word + stripl, alen + 1);
1058
170k
      return SimpleString(newword, alen + appndl);
1059
170k
    }
1060
394k
  }
1061
224k
  return SimpleString();
1062
394k
}
1063
1064
// check if this prefix entry matches 
1065
bool PfxEntry::check(const LookupInfo & linf, const AffixMgr * pmyMgr,
1066
                     ParmString word,
1067
                     CheckInfo & ci, GuessInfo * gi, bool cross) const
1068
549k
{
1069
549k
  unsigned int    cond; // condition number being examined
1070
549k
  unsigned              tmpl;   // length of tmpword
1071
549k
  WordEntry             wordinfo;     // hash entry of root word or NULL
1072
549k
  byte *  cp;   
1073
549k
  VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1);
1074
1075
  // on entry prefix is 0 length or already matches the beginning of the word.
1076
  // So if the remaining root word has positive length
1077
  // and if there are enough chars in root word and added back strip chars
1078
  // to meet the number of characters conditions, then test it
1079
1080
549k
  tmpl = word.size() - appndl;
1081
1082
549k
  if ((tmpl > 0) &&  (tmpl + stripl >= conds->num)) {
1083
1084
    // generate new root word by removing prefix and adding
1085
    // back any characters that would have been stripped
1086
1087
544k
    if (stripl) strcpy (tmpword, strip);
1088
544k
    strcpy ((tmpword + stripl), (word + appndl));
1089
1090
    // now make sure all of the conditions on characters
1091
    // are met.  Please see the appendix at the end of
1092
    // this file for more info on exactly what is being
1093
    // tested
1094
1095
544k
    cp = (byte *)tmpword;
1096
578k
    for (cond = 0;  cond < conds->num;  cond++) {
1097
401k
      if ((conds->get(*cp++) & (1 << cond)) == 0) break;
1098
401k
    }
1099
1100
    // if all conditions are met then check if resulting
1101
    // root word in the dictionary
1102
1103
544k
    if (cond >= conds->num) {
1104
176k
      CheckInfo * lci = 0;
1105
176k
      CheckInfo * guess = 0;
1106
176k
      tmpl += stripl;
1107
1108
176k
      int res = linf.lookup(tmpword, &linf.sp->s_cmp_end, achar, wordinfo, gi);
1109
1110
176k
      if (res == 1) {
1111
1112
323
        lci = &ci;
1113
323
        lci->word = wordinfo.word;
1114
323
        goto quit;
1115
        
1116
176k
      } else if (res == -1) {
1117
1118
2.14k
        guess = gi->head;
1119
1120
2.14k
      }
1121
      
1122
      // prefix matched but no root word was found 
1123
      // if XPRODUCT is allowed, try again but now 
1124
      // cross checked combined with a suffix
1125
      
1126
176k
      if (gi)
1127
9.23k
        lci = gi->head;
1128
      
1129
176k
      if (cross && xpflg & XPRODUCT) {
1130
176k
        if (pmyMgr->suffix_check(linf, ParmString(tmpword, tmpl), 
1131
176k
                                 ci, gi,
1132
176k
                                 XPRODUCT, (AffEntry *)this)) {
1133
6
          lci = &ci;
1134
          
1135
176k
        } else if (gi) {
1136
          
1137
9.23k
          CheckInfo * stop = lci;
1138
9.23k
          for (lci = gi->head; 
1139
9.45k
               lci != stop; 
1140
9.23k
               lci = const_cast<CheckInfo *>(lci->next)) 
1141
219
          {
1142
219
            lci->pre_flag = achar;
1143
219
            lci->pre_strip_len = stripl;
1144
219
            lci->pre_add_len = appndl;
1145
219
            lci->pre_add = appnd;
1146
219
          }
1147
          
1148
166k
        } else {
1149
          
1150
166k
          lci = 0;
1151
          
1152
166k
        }
1153
176k
      }
1154
    
1155
176k
      if (guess)
1156
2.14k
        lci = guess;
1157
      
1158
176k
    quit:
1159
176k
      if (lci) {
1160
5.88k
        lci->pre_flag = achar;
1161
5.88k
        lci->pre_strip_len = stripl;
1162
5.88k
        lci->pre_add_len = appndl;
1163
5.88k
        lci->pre_add = appnd;
1164
5.88k
      }
1165
176k
      if (lci == &ci) return true;
1166
176k
    }
1167
544k
  }
1168
548k
  return false;
1169
549k
}
1170
1171
bool SfxEntry::applicable(SimpleString word) const
1172
0
{
1173
0
  int cond;
1174
  /* make sure all conditions match */
1175
0
  if ((word.size > stripl) && (word.size >= conds->num)) {
1176
0
    const byte * cp = (const byte *) (word + word.size);
1177
0
    for (cond = conds->num; --cond >=0; ) {
1178
0
      if ((conds->get(*--cp) & (1 << cond)) == 0)
1179
0
        break;
1180
0
    }
1181
0
    if (cond < 0) return true;
1182
0
  }
1183
0
  return false;
1184
0
}
1185
1186
// add suffix to this word assuming conditions hold
1187
SimpleString SfxEntry::add(SimpleString word, ObjStack & buf, 
1188
                           int limit, SimpleString orig_word) const
1189
25.4M
{
1190
25.4M
  int cond;
1191
  /* make sure all conditions match */
1192
25.4M
  if ((orig_word.size > stripl) && (orig_word.size >= conds->num)) {
1193
19.9M
    const byte * cp = (const byte *) (orig_word + orig_word.size);
1194
36.2M
    for (cond = conds->num; --cond >=0; ) {
1195
31.6M
      if ((conds->get(*--cp) & (1 << cond)) == 0)
1196
15.3M
        break;
1197
31.6M
    }
1198
19.9M
    if (cond < 0) {
1199
4.64M
      int alen = word.size - stripl;
1200
4.64M
      if (alen >= limit) return EMPTY;
1201
      /* we have a match so add suffix */
1202
4.64M
      char * newword = (char *)buf.alloc(alen + appndl + 1);
1203
4.64M
      memcpy(newword, word, alen);
1204
4.64M
      memcpy(newword + alen, appnd, appndl + 1);
1205
4.64M
      return SimpleString(newword, alen + appndl);
1206
4.64M
    }
1207
19.9M
  }
1208
20.7M
  return SimpleString();
1209
25.4M
}
1210
1211
// see if this suffix is present in the word 
1212
bool SfxEntry::check(const LookupInfo & linf, ParmString word,
1213
                     CheckInfo & ci, GuessInfo * gi,
1214
                     int optflags, AffEntry* ppfx)
1215
71.9M
{
1216
71.9M
  unsigned              tmpl;    // length of tmpword 
1217
71.9M
  int     cond;    // condition beng examined
1218
71.9M
  WordEntry             wordinfo;        // hash entry pointer
1219
71.9M
  byte *  cp;
1220
71.9M
  VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1);
1221
71.9M
  PfxEntry* ep = (PfxEntry *) ppfx;
1222
1223
  // if this suffix is being cross checked with a prefix
1224
  // but it does not support cross products skip it
1225
1226
71.9M
  if ((optflags & XPRODUCT) != 0 &&  (xpflg & XPRODUCT) == 0)
1227
351
    return false;
1228
1229
  // upon entry suffix is 0 length or already matches the end of the word.
1230
  // So if the remaining root word has positive length
1231
  // and if there are enough chars in root word and added back strip chars
1232
  // to meet the number of characters conditions, then test it
1233
1234
71.9M
  tmpl = word.size() - appndl;
1235
1236
71.9M
  if ((tmpl > 0)  &&  (tmpl + stripl >= conds->num)) {
1237
1238
    // generate new root word by removing suffix and adding
1239
    // back any characters that would have been stripped or
1240
    // or null terminating the shorter string
1241
1242
49.4M
    strcpy (tmpword, word);
1243
49.4M
    cp = (byte *)(tmpword + tmpl);
1244
49.4M
    if (stripl) {
1245
48.8M
      strcpy ((char *)cp, strip);
1246
48.8M
      tmpl += stripl;
1247
48.8M
      cp = (byte *)(tmpword + tmpl);
1248
48.8M
    } else *cp = '\0';
1249
1250
    // now make sure all of the conditions on characters
1251
    // are met.  Please see the appendix at the end of
1252
    // this file for more info on exactly what is being
1253
    // tested
1254
1255
107M
    for (cond = conds->num;  --cond >= 0; ) {
1256
106M
      if ((conds->get(*--cp) & (1 << cond)) == 0) break;
1257
106M
    }
1258
1259
    // if all conditions are met then check if resulting
1260
    // root word in the dictionary
1261
1262
49.4M
    if (cond < 0) {
1263
523k
      CheckInfo * lci = 0;
1264
523k
      tmpl += stripl;
1265
523k
      const SensitiveCompare * cmp = 
1266
523k
        optflags & XPRODUCT ? &linf.sp->s_cmp_middle : &linf.sp->s_cmp_begin;
1267
523k
      int res = linf.lookup(tmpword, cmp, achar, wordinfo, gi);
1268
523k
      if (res == 1
1269
523k
          && ((optflags & XPRODUCT) == 0 || TESTAFF(wordinfo.aff, ep->achar)))
1270
1.50k
      {
1271
1.50k
        lci = &ci;
1272
1.50k
        lci->word = wordinfo.word;
1273
522k
      } else if (res == 1 && gi) {
1274
11
        lci = gi->add();
1275
11
        lci->word = wordinfo.word;
1276
522k
      } else if (res == -1) { // gi must be defined
1277
4.66k
        lci = gi->head;
1278
4.66k
      }
1279
1280
523k
      if (lci) {
1281
6.17k
        lci->suf_flag = achar;
1282
6.17k
        lci->suf_strip_len = stripl;
1283
6.17k
        lci->suf_add_len = appndl;
1284
6.17k
        lci->suf_add = appnd;
1285
6.17k
      }
1286
      
1287
523k
      if (lci == &ci) return true;
1288
523k
    }
1289
49.4M
  }
1290
71.9M
  return false;
1291
71.9M
}
1292
1293
//////////////////////////////////////////////////////////////////////
1294
//
1295
// new_affix_mgr
1296
//
1297
1298
1299
PosibErr<AffixMgr *> new_affix_mgr(ParmString name, 
1300
                                   Conv & iconv,
1301
                                   const Language * lang)
1302
1.06k
{
1303
1.06k
  if (name == "none")
1304
0
    return 0;
1305
  //CERR << "NEW AFFIX MGR\n";
1306
1.06k
  String file;
1307
1.06k
  file += lang->data_dir();
1308
1.06k
  file += '/';
1309
1.06k
  file += lang->name();
1310
1.06k
  file += "_affix.dat";
1311
1.06k
  AffixMgr * affix;
1312
1.06k
  affix = new AffixMgr(lang);
1313
1.06k
  PosibErrBase pe = affix->setup(file, iconv);
1314
1.06k
  if (pe.has_err()) {
1315
0
    delete affix;
1316
0
    return pe;
1317
1.06k
  } else {
1318
1.06k
    return affix;
1319
1.06k
  }
1320
1.06k
}
1321
}
1322
1323
/**************************************************************************
1324
1325
Appendix:  Understanding Affix Code
1326
1327
1328
An affix is either a  prefix or a suffix attached to root words to make 
1329
other words.
1330
1331
Basically a Prefix or a Suffix is set of AffEntry objects
1332
which store information about the prefix or suffix along 
1333
with supporting routines to check if a word has a particular 
1334
prefix or suffix or a combination.
1335
1336
The structure affentry is defined as follows:
1337
1338
struct AffEntry
1339
{
1340
   unsigned char achar;   // char used to represent the affix
1341
   char * strip;          // string to strip before adding affix
1342
   char * appnd;          // the affix string to add
1343
   short  stripl;         // length of the strip string
1344
   short  appndl;         // length of the affix string
1345
   short  numconds;       // the number of conditions that must be met
1346
   short  xpflg;          // flag: XPRODUCT- combine both prefix and suffix 
1347
   char   conds[SETSIZE]; // array which encodes the conditions to be met
1348
};
1349
1350
1351
Here is a suffix borrowed from the en_US.aff file.  This file 
1352
is whitespace delimited.
1353
1354
SFX D Y 4 
1355
SFX D   0     e          d
1356
SFX D   y     ied        [^aeiou]y
1357
SFX D   0     ed         [^ey]
1358
SFX D   0     ed         [aeiou]y
1359
1360
This information can be interpreted as follows:
1361
1362
In the first line has 4 fields
1363
1364
Field
1365
-----
1366
1     SFX - indicates this is a suffix
1367
2     D   - is the name of the character flag which represents this suffix
1368
3     Y   - indicates it can be combined with prefixes (cross product)
1369
4     4   - indicates that sequence of 4 affentry structures are needed to
1370
               properly store the affix information
1371
1372
The remaining lines describe the unique information for the 4 SfxEntry 
1373
objects that make up this affix.  Each line can be interpreted
1374
as follows: (note fields 1 and 2 are as a check against line 1 info)
1375
1376
Field
1377
-----
1378
1     SFX         - indicates this is a suffix
1379
2     D           - is the name of the character flag for this affix
1380
3     y           - the string of chars to strip off before adding affix
1381
                         (a 0 here indicates the NULL string)
1382
4     ied         - the string of affix characters to add
1383
5     [^aeiou]y   - the conditions which must be met before the affix
1384
                    can be applied
1385
1386
Field 5 is interesting.  Since this is a suffix, field 5 tells us that
1387
there are 2 conditions that must be met.  The first condition is that 
1388
the next to the last character in the word must *NOT* be any of the 
1389
following "a", "e", "i", "o" or "u".  The second condition is that
1390
the last character of the word must end in "y".
1391
1392
So how can we encode this information concisely and be able to 
1393
test for both conditions in a fast manner?  The answer is found
1394
but studying the wonderful ispell code of Geoff Kuenning, et.al. 
1395
(now available under a normal BSD license).
1396
1397
If we set up a conds array of 256 bytes indexed (0 to 255) and access it
1398
using a character (cast to an unsigned char) of a string, we have 8 bits
1399
of information we can store about that character.  Specifically we
1400
could use each bit to say if that character is allowed in any of the 
1401
last (or first for prefixes) 8 characters of the word.
1402
1403
Basically, each character at one end of the word (up to the number 
1404
of conditions) is used to index into the conds array and the resulting 
1405
value found there says whether the that character is valid for a 
1406
specific character position in the word.  
1407
1408
For prefixes, it does this by setting bit 0 if that char is valid 
1409
in the first position, bit 1 if valid in the second position, and so on. 
1410
1411
If a bit is not set, then that char is not valid for that position in the
1412
word.
1413
1414
If working with suffixes bit 0 is used for the character closest 
1415
to the front, bit 1 for the next character towards the end, ..., 
1416
with bit numconds-1 representing the last char at the end of the string. 
1417
1418
Note: since entries in the conds[] are 8 bits, only 8 conditions 
1419
(read that only 8 character positions) can be examined at one
1420
end of a word (the beginning for prefixes and the end for suffixes.
1421
1422
So to make this clearer, lets encode the conds array values for the 
1423
first two affentries for the suffix D described earlier.
1424
1425
1426
  For the first affentry:    
1427
     numconds = 1             (only examine the last character)
1428
1429
     conds['e'] =  (1 << 0)   (the word must end in an E)
1430
     all others are all 0
1431
1432
  For the second affentry:
1433
     numconds = 2             (only examine the last two characters)     
1434
1435
     conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
1436
         where X is all characters *but* a, e, i, o, or u
1437
         
1438
1439
     conds['y'] = (1 << 1)     (the last char must be a y)
1440
     all other bits for all other entries in the conds array are zero
1441
1442
1443
**************************************************************************/