Coverage Report

Created: 2025-07-18 06:42

/src/aspell/modules/speller/default/affix.cpp
Line
Count
Source (jump to first uncovered line)
1
// This file is part of The New Aspell
2
// Copyright (C) 2004 by Kevin Atkinson under the GNU LGPL
3
// license version 2.0 or 2.1.  You should have received a copy of the
4
// LGPL license along with this library if you did not you can find it
5
// at http://www.gnu.org/.
6
//
7
// This code is based on the the MySpell affix code:
8
//
9
/*
10
 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada And
11
 * Contributors.  All rights reserved.
12
 *
13
 * Redistribution and use in source and binary forms, with or without
14
 * modification, are permitted provided that the following conditions
15
 * are met:
16
 *
17
 * 1. Redistributions of source code must retain the above copyright
18
 *    notice, this list of conditions and the following disclaimer.
19
 *
20
 * 2. Redistributions in binary form must reproduce the above copyright
21
 *    notice, this list of conditions and the following disclaimer in the
22
 *    documentation and/or other materials provided with the distribution.
23
 *
24
 * 3. All modifications to the source code must be clearly marked as
25
 *    such.  Binary redistributions based on modified source code
26
 *    must be clearly marked as modified versions in the documentation
27
 *    and/or other materials provided with the distribution.
28
 *
29
 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
30
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
31
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
32
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
33
 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
34
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
35
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
36
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40
 * SUCH DAMAGE.
41
 *
42
 */
43
44
#include <cstdlib>
45
#include <cstring>
46
#include <cstdio>
47
48
//#include "iostream.hpp"
49
50
#include "affix.hpp"
51
#include "errors.hpp"
52
#include "getdata.hpp"
53
#include "parm_string.hpp"
54
#include "check_list.hpp"
55
#include "speller_impl.hpp"
56
#include "vararray.hpp"
57
#include "lsort.hpp"
58
#include "hash-t.hpp"
59
60
#include "gettext.h"
61
62
using namespace std;
63
64
namespace aspeller {
65
66
typedef unsigned char byte;
67
static char EMPTY[1] = {0};
68
69
//////////////////////////////////////////////////////////////////////
70
//
71
// Entry struct definations
72
//
73
74
struct Conds
75
{
76
  char * str;
77
  unsigned num;
78
  char conds[SETSIZE];
79
120M
  char get(byte i) const {return conds[i];}
80
};
81
82
struct AffEntry
83
{
84
  const char *   appnd;
85
  const char *   strip;
86
  byte           appndl;
87
  byte           stripl;
88
  byte           xpflg;
89
  char           achar;
90
  const Conds *  conds;
91
  //unsigned int numconds;
92
  //char         conds[SETSIZE];
93
};
94
95
// A Prefix Entry
96
  
97
struct PfxEntry : public AffEntry
98
{
99
  PfxEntry * next;
100
  PfxEntry * next_eq;
101
  PfxEntry * next_ne;
102
  PfxEntry * flag_next;
103
10.9k
  PfxEntry() {}
104
105
  bool check(const LookupInfo &, const AffixMgr * pmyMgr,
106
             ParmString, CheckInfo &, GuessInfo *, bool cross = true) const;
107
108
122k
  inline bool          allow_cross() const { return ((xpflg & XPRODUCT) != 0); }
109
10.9k
  inline byte flag() const { return achar;  }
110
1.56M
  inline const char *  key() const  { return appnd;  }
111
  bool applicable(SimpleString) const;
112
  SimpleString add(SimpleString, ObjStack & buf) const;
113
};
114
115
// A Suffix Entry
116
117
struct SfxEntry : public AffEntry
118
{
119
  const char * rappnd; // this is set in AffixMgr::build_sfxlist
120
  
121
  SfxEntry *   next;
122
  SfxEntry *   next_eq;
123
  SfxEntry *   next_ne;
124
  SfxEntry *   flag_next;
125
126
917k
  SfxEntry() {}
127
128
  bool check(const LookupInfo &, ParmString, CheckInfo &, GuessInfo *,
129
             int optflags, AffEntry * ppfx);
130
131
644k
  inline bool          allow_cross() const { return ((xpflg & XPRODUCT) != 0); }
132
917k
  inline byte flag() const { return achar;  }
133
649M
  inline const char *  key() const  { return rappnd; } 
134
  bool applicable(SimpleString) const;
135
  SimpleString add(SimpleString, ObjStack & buf, int limit, SimpleString) const;
136
};
137
138
//////////////////////////////////////////////////////////////////////
139
//
140
// Utility functions declarations
141
//
142
143
/* return 1 if s1 is subset of s2 */
144
static bool isSubset(const char * s1, const char * s2)
145
311M
{
146
1.30G
  while( *s1 && (*s1 == *s2) ) {
147
994M
    s1++;
148
994M
    s2++;
149
994M
  }
150
311M
  return (*s1 == '\0');
151
311M
}
152
153
// return 1 if s1 (reversed) is a leading subset of end of s2
154
static bool isRevSubset(const char * s1, const char * end_of_s2, int len)
155
9.22M
{
156
20.6M
  while( (len > 0) && *s1 && (*s1 == *end_of_s2) ) {
157
11.4M
    s1++;
158
11.4M
    end_of_s2--;
159
11.4M
    len --;
160
11.4M
  }
161
9.22M
  return (*s1 == '\0');
162
9.22M
}
163
164
template <class T>
165
struct AffixLess
166
{
167
9.82M
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
aspeller::AffixLess<aspeller::PfxEntry>::operator()(aspeller::PfxEntry*, aspeller::PfxEntry*) const
Line
Count
Source
167
9.80k
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
aspeller::AffixLess<aspeller::SfxEntry>::operator()(aspeller::SfxEntry*, aspeller::SfxEntry*) const
Line
Count
Source
167
9.81M
  bool operator() (T * x, T * y) const {return strcmp(x->key(),y->key()) < 0;}
168
};
169
170
// struct StringLookup {
171
//   struct Parms {
172
//     typedef const char * Value;
173
//     typedef const char * Key;
174
//     static const bool is_multi = false;
175
//     hash<const char *> hfun;
176
//     size_t hash(const char * s) {return hfun(s);}
177
//     bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;}
178
//     const char * key(const char * c) {return c;}
179
//   };
180
//   typedef HashTable<Parms> Lookup;
181
//   Lookup lookup;
182
//   ObjStack * data_buf;
183
//   StringLookup(ObjStack * b) : data_buf(b) {}
184
//   const char * dup(const char * orig) {
185
//     pair<Lookup::iterator, bool> res = lookup.insert(orig);
186
//     if (res.second) *res.first = data_buf->dup(orig);
187
//     return *res.first;
188
//     //return data_buf->dup(orig);
189
//   }
190
// };
191
192
struct CondsLookupParms {
193
  typedef const Conds * Value;
194
  typedef const char * Key;
195
  static const bool is_multi = false;
196
  acommon::hash<const char *> hfun;
197
963k
  size_t hash(const char * s) {return hfun(s);}
198
1.42M
  bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;}
199
1.46M
  const char * key(const Conds * c) {return c->str;}
200
};
201
202
typedef HashTable<CondsLookupParms> CondsLookup;
203
204
// normalizes and checks the cond_str
205
// returns the length of the new string or -1 if invalid
206
static int normalize_cond_str(char * str)
207
927k
{
208
927k
  char * s = str;
209
927k
  char * d = str;
210
4.73M
  while (*s) {
211
3.81M
    if (*s != '[') {
212
3.69M
      *d++ = *s++;
213
3.69M
    } else if (s[1] == '\0' || s[1] == ']') {
214
0
      return -1;
215
118k
    } else if (s[2] == ']') {
216
0
      *d++ = s[1];
217
0
      s += 3;
218
118k
    } else {
219
118k
      *d++ = *s++;
220
118k
      if (*s == '^') *d++ = *s++;
221
451k
      while (*s != ']') {
222
333k
        if (*s == '\0' || *s == '[') return -1;
223
333k
        char * min = s;
224
725k
        for (char * i = s + 1; *i != ']'; ++i) {
225
392k
          if ((byte)*i < (byte)*min) min = i;}
226
333k
        char c = *s;
227
333k
        *d++ = *min;
228
333k
        *min = c;
229
333k
        ++s;
230
333k
      }
231
118k
      *d++ = *s++;
232
118k
    }
233
3.81M
  }
234
927k
  *d = '\0';
235
927k
  return d - str;
236
927k
}
237
238
static void encodeit(CondsLookup &, ObjStack &, 
239
                     AffEntry * ptr, char * cs);
240
241
//////////////////////////////////////////////////////////////////////
242
//
243
// Affix Manager
244
//
245
246
PosibErr<void> AffixMgr::setup(ParmString affpath, Conv & iconv)
247
1.02k
{
248
  // register hash manager and load affix data from aff file
249
  //cpdmin = 3;  // default value
250
1.02k
  max_strip_ = 0;
251
264k
  for (int i=0; i < SETSIZE; i++) {
252
263k
    pStart[i] = NULL;
253
263k
    sStart[i] = NULL;
254
263k
    pFlag[i] = NULL;
255
263k
    sFlag[i] = NULL;
256
263k
    max_strip_f[i] = 0;
257
263k
  }
258
1.02k
  return parse_file(affpath, iconv);
259
1.02k
}
260
261
AffixMgr::AffixMgr(const Language * l) 
262
1.02k
  : lang(l), data_buf(1024*16) {}
263
264
1.02k
AffixMgr::~AffixMgr() {}
265
266
static inline void max_(int & lhs, int rhs) 
267
1.25M
{
268
1.25M
  if (lhs < rhs) lhs = rhs;
269
1.25M
}
270
271
// read in aff file and build up prefix and suffix entry objects 
272
PosibErr<void> AffixMgr::parse_file(const char * affpath, Conv & iconv)
273
1.02k
{
274
  // io buffers
275
1.02k
  String buf; DataPair dp;
276
277
1.02k
  CondsLookup conds_lookup;
278
 
279
  // open the affix file
280
1.02k
  affix_file = data_buf.dup(affpath);
281
1.02k
  FStream afflst;
282
1.02k
  RET_ON_ERR(afflst.open(affpath,"r"));
283
284
  // step one is to parse the affix file building up the internal
285
  // affix data structures
286
287
  // read in each line ignoring any that do not
288
  // start with a known line type indicator
289
290
1.02k
  char prev_aff = '\0';
291
292
156k
  while (getdata_pair(afflst,dp,buf)) {
293
155k
    char affix_type = ' ';
294
295
    /* parse in the name of the character set used by the .dict and .aff */
296
297
155k
    if (dp.key == "SET") {
298
1.02k
      String buf;
299
1.02k
      encoding = data_buf.dup(fix_encoding_str(dp.value, buf));
300
1.02k
      if (strcmp(encoding, lang->data_encoding()) != 0)
301
0
        return make_err(incorrect_encoding, affix_file, lang->data_encoding(), encoding);
302
1.02k
    }
303
304
    /* parse in the flag used by the controlled compound words */
305
    //else if (d.key == "COMPOUNDFLAG")
306
    //  compound = data_buf.dup(d.value);
307
308
    /* parse in the flag used by the controlled compound words */
309
    //else if (d.key == "COMPOUNDMIN")
310
    //  cpdmin = atoi(d.value); // FiXME
311
312
    //else if (dp.key == "TRY" || dp.key == "REP");
313
314
154k
    else if (dp.key == "PFX" || dp.key == "SFX")
315
26.3k
      affix_type = dp.key[0];
316
317
155k
    if (affix_type == ' ') continue;
318
319
    //
320
    // parse this affix: P - prefix, S - suffix
321
    //
322
323
26.3k
    int numents = 0;      // number of affentry structures to parse
324
26.3k
    char achar='\0';      // affix char identifier
325
26.3k
    short xpflg=0;
326
26.3k
    AffEntry * nptr;
327
26.3k
    {
328
      // split affix header line into pieces
329
26.3k
      split(dp);
330
26.3k
      if (dp.key.empty()) goto error;
331
      // key is affix char
332
26.3k
      const char * astr = iconv(dp.key);
333
26.3k
      if (astr[0] == '\0' || astr[1] != '\0') goto error;
334
26.3k
      achar = astr[0];
335
26.3k
      if (achar == prev_aff) goto error_count;
336
26.3k
      prev_aff = achar;
337
338
26.3k
      split(dp);
339
26.3k
      if (dp.key.size != 1 || 
340
26.3k
          !(dp.key[0] == 'Y' || dp.key[0] == 'N')) goto error;
341
      // key is cross product indicator 
342
26.3k
      if (dp.key[0] == 'Y') xpflg = XPRODUCT;
343
    
344
26.3k
      split(dp);
345
26.3k
      if (dp.key.empty()) goto error;
346
      // key is number of affentries
347
      
348
26.3k
      numents = atoi(dp.key); 
349
  
350
954k
      for (int j = 0; j < numents; j++) {
351
927k
        getdata_pair(afflst, dp, buf);
352
353
927k
        if (affix_type == 'P') {
354
10.9k
          nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(PfxEntry));
355
10.9k
          new (nptr) PfxEntry;
356
917k
        } else {
357
917k
          nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(SfxEntry));
358
917k
          new (nptr) SfxEntry;
359
917k
        }
360
361
927k
        nptr->xpflg = xpflg;
362
363
927k
        split(dp);
364
927k
        if (dp.key.empty()) goto error;
365
        // key is affix charter
366
927k
        if (iconv(dp.key)[0] != achar) goto error_count;
367
927k
        nptr->achar = achar;
368
 
369
927k
        split(dp);
370
927k
        if (dp.key.empty()) goto error;
371
        // key is strip 
372
927k
        if (dp.key != "0") {
373
629k
          ParmString s0(iconv(dp.key));
374
629k
          max_(max_strip_, s0.size());
375
629k
          max_(max_strip_f[(byte)achar], s0.size());
376
629k
          nptr->strip = data_buf.dup(s0);
377
629k
          nptr->stripl = s0.size();
378
629k
        } else {
379
298k
          nptr->strip  = "";
380
298k
          nptr->stripl = 0;
381
298k
        }
382
    
383
927k
        split(dp);
384
927k
        if (dp.key.empty()) goto error;
385
        // key is affix string or 0 for null
386
927k
        if (dp.key != "0") {
387
925k
          nptr->appnd  = data_buf.dup(iconv(dp.key));
388
925k
          nptr->appndl = strlen(nptr->appnd);
389
925k
        } else {
390
1.97k
          nptr->appnd  = "";
391
1.97k
          nptr->appndl = 0;
392
1.97k
        }
393
    
394
927k
        split(dp);
395
927k
        if (dp.key.empty()) goto error;
396
        // key is the conditions descriptions
397
927k
        char * cond = iconv(dp.key);
398
927k
        int cond_len = normalize_cond_str(cond);
399
927k
        if (cond_len < 0)
400
0
          return (make_err(invalid_cond, MsgConv(lang)(cond))
401
0
                  .with_file(affix_file, dp.line_num));
402
927k
        if (nptr->stripl != 0) {
403
629k
          char * cc = cond;
404
629k
          if (affix_type == 'S') cc += cond_len - nptr->stripl;
405
629k
          if (cond_len < nptr->stripl || 
406
629k
              memcmp(cc, nptr->strip, nptr->stripl) != 0)
407
0
            return (make_err(invalid_cond_strip, 
408
0
                             MsgConv(lang)(cond), MsgConv(lang)(nptr->strip))
409
0
                    .with_file(affix_file, dp.line_num));
410
629k
        }
411
927k
        encodeit(conds_lookup, data_buf, nptr, cond);
412
    
413
        // now create SfxEntry or PfxEntry objects and use links to
414
        // build an ordered (sorted by affix string) list
415
927k
        if (affix_type == 'P')
416
10.9k
          build_pfxlist(static_cast<PfxEntry *>(nptr));
417
917k
        else
418
917k
          build_sfxlist(static_cast<SfxEntry *>(nptr)); 
419
927k
      }
420
26.3k
    }
421
26.3k
    continue;
422
26.3k
  error:
423
0
    return make_err(corrupt_affix, MsgConv(lang)(achar)).with_file(affix_file, dp.line_num);
424
0
  error_count:
425
0
    return make_err(corrupt_affix, MsgConv(lang)(achar), 
426
0
                    _("Possibly incorrect count.")).with_file(affix_file, dp.line_num);
427
26.3k
  }
428
1.02k
  afflst.close();
429
430
  // now we can speed up performance greatly taking advantage of the 
431
  // relationship between the affixes and the idea of "subsets".
432
433
  // View each prefix as a potential leading subset of another and view
434
  // each suffix (reversed) as a potential trailing subset of another.
435
436
  // To illustrate this relationship if we know the prefix "ab" is
437
  // found in the word to examine, only prefixes that "ab" is a
438
  // leading subset of need be examined.  Furthermore is "ab" is not
439
  // present then none of the prefixes that "ab" is is a subset need
440
  // be examined.
441
442
  // The same argument goes for suffix string that are reversed.
443
444
  // Then to top this off why not examine the first char of the word
445
  // to quickly limit the set of prefixes to examine (i.e. the
446
  // prefixes to examine must be leading supersets of the first
447
  // character of the word (if they exist)
448
 
449
  // To take advantage of this "subset" relationship, we need to add
450
  // two links from entry.  One to take next if the current prefix
451
  // is found (call it nexteq) and one to take next if the current
452
  // prefix is not found (call it nextne).
453
454
  // Since we have built ordered lists, all that remains is to
455
  // properly initialize the nextne and nexteq pointers that relate
456
  // them
457
458
1.02k
  process_pfx_order();
459
1.02k
  process_sfx_order();
460
461
  //CERR.printf("%u\n", data_buf.calc_size()/1024);
462
463
1.02k
  return no_err;
464
465
1.02k
}
466
467
468
// we want to be able to quickly access prefix information
469
// both by prefix flag, and sorted by prefix string itself
470
// so we need to set up two indexes
471
472
PosibErr<void> AffixMgr::build_pfxlist(PfxEntry* pfxptr)
473
10.9k
{
474
10.9k
  PfxEntry * ptr;
475
10.9k
  PfxEntry * ep = pfxptr;
476
477
  // get the right starting point 
478
10.9k
  const char * key = ep->key();
479
10.9k
  const byte flg = ep->flag();
480
481
  // first index by flag which must exist
482
10.9k
  ptr = pFlag[flg];
483
10.9k
  ep->flag_next = ptr;
484
10.9k
  pFlag[flg] = ep;
485
486
  // next insert the affix string, it will be sorted latter
487
488
10.9k
  byte sp = *((const byte *)key);
489
10.9k
  ptr = pStart[sp];
490
10.9k
  ep->next = ptr;
491
10.9k
  pStart[sp] = ep;
492
10.9k
  return no_err;
493
10.9k
}
494
495
// we want to be able to quickly access suffix information
496
// both by suffix flag, and sorted by the reverse of the
497
// suffix string itself; so we need to set up two indexes
498
499
PosibErr<void> AffixMgr::build_sfxlist(SfxEntry* sfxptr)
500
917k
{
501
917k
  SfxEntry * ptr;
502
917k
  SfxEntry * ep = sfxptr;
503
917k
  char * tmp = (char *)data_buf.alloc(sfxptr->appndl + 1);
504
917k
  sfxptr->rappnd = tmp;
505
506
  // reverse the string
507
917k
  char * dest = tmp + sfxptr->appndl;
508
917k
  *dest-- = 0;
509
917k
  const char * src = sfxptr->appnd;
510
6.24M
  for (; dest >= tmp; --dest, ++src)
511
5.33M
    *dest = *src;
512
513
  /* get the right starting point */
514
917k
  const char * key = ep->key();
515
917k
  const byte flg = ep->flag();
516
517
  // first index by flag which must exist
518
917k
  ptr = sFlag[flg];
519
917k
  ep->flag_next = ptr;
520
917k
  sFlag[flg] = ep;
521
522
  // next insert the affix string, it will be sorted latter
523
    
524
917k
  byte sp = *((const byte *)key);
525
917k
  ptr = sStart[sp];
526
917k
  ep->next = ptr;
527
917k
  sStart[sp] = ep;
528
917k
  return no_err;
529
917k
}
530
531
532
533
// initialize the PfxEntry links NextEQ and NextNE to speed searching
534
PosibErr<void> AffixMgr::process_pfx_order()
535
1.02k
{
536
1.02k
  PfxEntry* ptr;
537
538
  // loop through each prefix list starting point
539
263k
  for (int i=1; i < SETSIZE; i++) {
540
541
262k
    ptr = pStart[i];
542
543
262k
    if (ptr && ptr->next)
544
1.40k
      ptr = pStart[i] = sort(ptr, AffixLess<PfxEntry>());
545
546
    // look through the remainder of the list
547
    //  and find next entry with affix that 
548
    // the current one is not a subset of
549
    // mark that as destination for NextNE
550
    // use next in list that you are a subset
551
    // of as NextEQ
552
553
273k
    for (; ptr != NULL; ptr = ptr->next) {
554
555
10.9k
      PfxEntry * nptr = ptr->next;
556
15.2k
      for (; nptr != NULL; nptr = nptr->next) {
557
8.30k
        if (! isSubset( ptr->key() , nptr->key() )) break;
558
8.30k
      }
559
10.9k
      ptr->next_ne = nptr;
560
10.9k
      ptr->next_eq = NULL;
561
10.9k
      if ((ptr->next) && isSubset(ptr->key() , 
562
4.42k
                                  (ptr->next)->key())) 
563
1.05k
        ptr->next_eq = ptr->next;
564
10.9k
    }
565
566
    // now clean up by adding smart search termination strings
567
    // if you are already a superset of the previous prefix
568
    // but not a subset of the next, search can end here
569
    // so set NextNE properly
570
571
262k
    ptr = pStart[i];
572
273k
    for (; ptr != NULL; ptr = ptr->next) {
573
10.9k
      PfxEntry * nptr = ptr->next;
574
10.9k
      PfxEntry * mptr = NULL;
575
15.2k
      for (; nptr != NULL; nptr = nptr->next) {
576
8.30k
        if (! isSubset(ptr->key(),nptr->key())) break;
577
4.31k
        mptr = nptr;
578
4.31k
      }
579
10.9k
      if (mptr) mptr->next_ne = NULL;
580
10.9k
    }
581
262k
  }
582
1.02k
  return no_err;
583
1.02k
}
584
585
586
587
// initialize the SfxEntry links NextEQ and NextNE to speed searching
588
PosibErr<void> AffixMgr::process_sfx_order()
589
1.02k
{
590
1.02k
  SfxEntry* ptr;
591
592
  // loop through each prefix list starting point
593
263k
  for (int i=1; i < SETSIZE; i++) {
594
595
262k
    ptr = sStart[i];
596
597
262k
    if (ptr && ptr->next)
598
8.87k
      ptr = sStart[i] = sort(ptr, AffixLess<SfxEntry>());
599
600
    // look through the remainder of the list
601
    //  and find next entry with affix that 
602
    // the current one is not a subset of
603
    // mark that as destination for NextNE
604
    // use next in list that you are a subset
605
    // of as NextEQ
606
607
1.17M
    for (; ptr != NULL; ptr = ptr->next) {
608
915k
      SfxEntry * nptr = ptr->next;
609
154M
      for (; nptr != NULL; nptr = nptr->next) {
610
154M
        if (! isSubset(ptr->key(),nptr->key())) break;
611
154M
      }
612
915k
      ptr->next_ne = nptr;
613
915k
      ptr->next_eq = NULL;
614
915k
      if ((ptr->next) && isSubset(ptr->key(),(ptr->next)->key())) 
615
745k
        ptr->next_eq = ptr->next;
616
915k
    }
617
618
619
    // now clean up by adding smart search termination strings:
620
    // if you are already a superset of the previous suffix
621
    // but not a subset of the next, search can end here
622
    // so set NextNE properly
623
624
262k
    ptr = sStart[i];
625
1.17M
    for (; ptr != NULL; ptr = ptr->next) {
626
915k
      SfxEntry * nptr = ptr->next;
627
915k
      SfxEntry * mptr = NULL;
628
154M
      for (; nptr != NULL; nptr = nptr->next) {
629
154M
        if (! isSubset(ptr->key(),nptr->key())) break;
630
153M
        mptr = nptr;
631
153M
      }
632
915k
      if (mptr) mptr->next_ne = NULL;
633
915k
    }
634
262k
  }
635
1.02k
  return no_err;
636
1.02k
}
637
638
// takes aff file condition string and creates the
639
// conds array - please see the appendix at the end of the
640
// file affentry.cxx which describes what is going on here
641
// in much more detail
642
643
static void encodeit(CondsLookup & l, ObjStack & buf, 
644
                     AffEntry * ptr, char * cs)
645
927k
{
646
927k
  byte c;
647
927k
  int i, j, k;
648
649
  // see if we already have this conds matrix
650
651
927k
  CondsLookup::iterator itr = l.find(cs);
652
927k
  if (!(itr == l.end())) {
653
904k
    ptr->conds = *itr;
654
904k
    return;
655
904k
  }
656
657
23.4k
  Conds * cds = (Conds *)buf.alloc_bottom(sizeof(Conds));
658
23.4k
  cds->str = buf.dup(cs);
659
23.4k
  l.insert(cds);
660
23.4k
  ptr->conds = cds;
661
662
23.4k
  int nc = strlen(cs);
663
23.4k
  VARARRAYM(byte, mbr, nc + 1, MAXLNLEN);
664
665
  // now clear the conditions array
666
23.4k
  memset(cds->conds, 0, sizeof(cds->conds));
667
668
  // now parse the string to create the conds array
669
  
670
23.4k
  int neg = 0;   // complement indicator
671
23.4k
  int grp = 0;   // group indicator
672
23.4k
  int n = 0;     // number of conditions
673
23.4k
  int ec = 0;    // end condition indicator
674
23.4k
  int nm = 0;    // number of member in group
675
676
  // if no condition just return
677
23.4k
  if (strcmp(cs,".")==0) {
678
1.02k
    cds->num = 0;
679
1.02k
    return;
680
1.02k
  }
681
682
22.4k
  i = 0;
683
135k
  while (i < nc) {
684
112k
    c = *((byte *)(cs + i));
685
686
    // start group indicator
687
112k
    if (c == '[') {
688
11.5k
      grp = 1;
689
11.5k
      c = 0;
690
11.5k
    }
691
692
    // complement flag
693
112k
    if ((grp == 1) && (c == '^')) {
694
8.25k
      neg = 1;
695
8.25k
      c = 0;
696
8.25k
    }
697
698
    // end goup indicator
699
112k
    if (c == ']') {
700
11.5k
      ec = 1;
701
11.5k
      c = 0;
702
11.5k
    }
703
704
    // add character of group to list
705
112k
    if ((grp == 1) && (c != 0)) {
706
40.2k
      *(mbr + nm) = c;
707
40.2k
      nm++;
708
40.2k
      c = 0;
709
40.2k
    }
710
711
    // end of condition 
712
112k
    if (c != 0) {
713
41.4k
      ec = 1;
714
41.4k
    }
715
716
    
717
112k
    if (ec) {
718
52.9k
      if (grp == 1) {
719
11.5k
        if (neg == 0) {
720
          // set the proper bits in the condition array vals for those chars
721
16.7k
          for (j=0;j<nm;j++) {
722
13.4k
            k = (unsigned int) mbr[j];
723
13.4k
            cds->conds[k] = cds->conds[k] | (1 << n);
724
13.4k
          }
725
8.25k
        } else {
726
          // complement so set all of them and then unset indicated ones
727
2.12M
          for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n);
728
35.0k
          for (j=0;j<nm;j++) {
729
26.7k
            k = (unsigned int) mbr[j];
730
26.7k
            cds->conds[k] = cds->conds[k] & ~(1 << n);
731
26.7k
          }
732
8.25k
        }
733
11.5k
        neg = 0;
734
11.5k
        grp = 0;   
735
11.5k
        nm = 0;
736
41.4k
      } else {
737
        // not a group so just set the proper bit for this char
738
        // but first handle special case of . inside condition
739
41.4k
        if (c == '.') {
740
          // wild card character so set them all
741
0
          for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n);
742
41.4k
        } else {  
743
41.4k
          cds->conds[(unsigned int)c] = cds->conds[(unsigned int)c] | (1 << n);
744
41.4k
        }
745
41.4k
      }
746
52.9k
      n++;
747
52.9k
      ec = 0;
748
52.9k
    }
749
750
751
112k
    i++;
752
112k
  }
753
22.4k
  cds->num = n;
754
22.4k
  return;
755
23.4k
}
756
757
758
// check word for prefixes
759
bool AffixMgr::prefix_check (const LookupInfo & linf, ParmString word, 
760
                             CheckInfo & ci, GuessInfo * gi, bool cross) const
761
742k
{
762
742k
  if (word.empty()) return false;
763
 
764
  // first handle the special case of 0 length prefixes
765
742k
  PfxEntry * pe = pStart[0];
766
742k
  while (pe) {
767
0
    if (pe->check(linf,this,word,ci,gi)) return true;
768
0
    pe = pe->next;
769
0
  }
770
  
771
  // now handle the general case
772
742k
  byte sp = *reinterpret_cast<const byte *>(word.str());
773
742k
  PfxEntry * pptr = pStart[sp];
774
775
2.23M
  while (pptr) {
776
1.49M
    if (isSubset(pptr->key(),word)) {
777
155k
      if (pptr->check(linf,this,word,ci,gi,cross)) return true;
778
155k
      pptr = pptr->next_eq;
779
1.33M
    } else {
780
1.33M
      pptr = pptr->next_ne;
781
1.33M
    }
782
1.49M
  }
783
    
784
742k
  return false;
785
742k
}
786
787
788
// check word for suffixes
789
bool AffixMgr::suffix_check (const LookupInfo & linf, ParmString word, 
790
                             CheckInfo & ci, GuessInfo * gi,
791
                             int sfxopts, AffEntry * ppfx) const
792
927k
{
793
927k
  if (word.empty()) return false;
794
795
  // first handle the special case of 0 length suffixes
796
927k
  SfxEntry * se = sStart[0];
797
43.1M
  while (se) {
798
42.2M
    if (se->check(linf, word, ci, gi, sfxopts, ppfx)) return true;
799
42.2M
    se = se->next;
800
42.2M
  }
801
  
802
  // now handle the general case
803
927k
  byte sp = *((const byte *)(word + word.size() - 1));
804
927k
  SfxEntry * sptr = sStart[sp];
805
806
10.1M
  while (sptr) {
807
9.22M
    if (isRevSubset(sptr->key(), word + word.size() - 1, word.size())) {
808
6.56M
      if (sptr->check(linf, word, ci, gi, sfxopts, ppfx)) return true;
809
6.56M
      sptr = sptr->next_eq;
810
6.56M
    } else {
811
2.66M
      sptr = sptr->next_ne;
812
2.66M
    }
813
9.22M
  }
814
    
815
926k
  return false;
816
927k
}
817
818
// check if word with affixes is correctly spelled
819
bool AffixMgr::affix_check(const LookupInfo & linf, ParmString word, 
820
                           CheckInfo & ci, GuessInfo * gi) const
821
742k
{
822
742k
  if (word.empty()) return false;
823
824
  // Deal With Case in a semi-intelligent manner
825
741k
  CasePattern cp = lang->LangImpl::case_pattern(word);
826
741k
  ParmString pword = word;
827
741k
  ParmString sword = word;
828
741k
  CharVector lower;
829
741k
  if (cp == FirstUpper) {
830
36.9k
    lower.append(word, word.size() + 1);
831
36.9k
    lower[0] = lang->to_lower(word[0]);
832
36.9k
    pword = ParmString(lower.data(), lower.size() - 1);
833
704k
  } else if (cp == AllUpper) {
834
55.8k
    lower.resize(word.size() + 1);
835
55.8k
    unsigned int i = 0;
836
2.00M
    for (; i != word.size(); ++i)
837
1.94M
      lower[i] = lang->to_lower(word[i]);
838
55.8k
    lower[i] = '\0';
839
55.8k
    pword = ParmString(lower.data(), lower.size() - 1);
840
55.8k
    sword = pword;
841
55.8k
  }
842
843
  // check all prefixes (also crossed with suffixes if allowed) 
844
741k
  if (prefix_check(linf, pword, ci, gi)) return true;
845
846
  // if still not found check all suffixes
847
741k
  if (suffix_check(linf, sword, ci, gi, 0, NULL)) return true;
848
849
  // if still not found check again but with the lower case version
850
  // which can make a difference if the entire word matches the cond
851
  // string
852
740k
  if (cp == FirstUpper) {
853
36.9k
    return suffix_check(linf, pword, ci, gi, 0, NULL);
854
703k
  } else {
855
703k
    return false;
856
703k
  }
857
740k
}
858
859
void AffixMgr::munch(ParmString word, GuessInfo * gi, bool cross) const
860
1.56k
{
861
1.56k
  LookupInfo li(0, LookupInfo::AlwaysTrue);
862
1.56k
  CheckInfo ci;
863
1.56k
  gi->reset();
864
1.56k
  CasePattern cp = lang->LangImpl::case_pattern(word);
865
1.56k
  if (cp == AllUpper) return;
866
1.55k
  if (cp != FirstUpper)
867
1.18k
    prefix_check(li, word, ci, gi, cross);
868
1.55k
  suffix_check(li, word, ci, gi, 0, NULL);
869
1.55k
}
870
871
WordAff * AffixMgr::expand(ParmString word, ParmString aff, 
872
                           ObjStack & buf, int limit) const
873
534k
{
874
534k
  byte * empty = (byte *)buf.alloc(1);
875
534k
  *empty = 0;
876
877
534k
  byte * suf  = (byte *)buf.alloc(aff.size() + 1); 
878
534k
  byte * suf_e = suf;
879
534k
  byte * csuf = (byte *)buf.alloc(aff.size() + 1); 
880
534k
  byte * csuf_e = csuf;
881
882
534k
  WordAff * head = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
883
534k
  WordAff * cur = head;
884
534k
  cur->word = buf.dup(word);
885
534k
  cur->aff  = suf;
886
887
534k
  for (const byte * c = (const byte *)aff.str(), * end = c + aff.size();
888
1.30M
       c != end; 
889
767k
       ++c) 
890
767k
  {
891
767k
    if (sFlag[*c]) *suf_e++ = *c; 
892
767k
    if (sFlag[*c] && sFlag[*c]->allow_cross()) *csuf_e++ = *c;
893
    
894
1.05M
    for (PfxEntry * p = pFlag[*c]; p; p = p->flag_next) {
895
285k
      SimpleString newword = p->add(word, buf);
896
285k
      if (!newword) continue;
897
122k
      cur->next = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
898
122k
      cur = cur->next;
899
122k
      cur->word = newword;
900
122k
      cur->aff = p->allow_cross() ? csuf : empty;
901
122k
    }
902
767k
  }
903
904
534k
  *suf_e = 0;
905
534k
  *csuf_e = 0;
906
534k
  cur->next = 0;
907
908
534k
  if (limit == 0) return head;
909
910
534k
  WordAff * * end = &cur->next;
911
534k
  WordAff * * very_end = end;
912
534k
  size_t nsuf_s = suf_e - suf + 1;
913
914
1.19M
  for (WordAff * * cur = &head; cur != end; cur = &(*cur)->next) {
915
657k
    if ((int)(*cur)->word.size - max_strip_ >= limit) continue;
916
657k
    byte * nsuf = (byte *)buf.alloc(nsuf_s);
917
657k
    expand_suffix((*cur)->word, (*cur)->aff, buf, limit, nsuf, &very_end, word);
918
657k
    (*cur)->aff = nsuf;
919
657k
  }
920
921
534k
  return head;
922
534k
}
923
924
WordAff * AffixMgr::expand_suffix(ParmString word, const byte * aff, 
925
                                  ObjStack & buf, int limit,
926
                                  byte * new_aff, WordAff * * * l,
927
                                  ParmString orig_word) const
928
657k
{
929
657k
  WordAff * head = 0;
930
657k
  if (l) head = **l;
931
657k
  WordAff * * cur = l ? *l : &head;
932
657k
  bool expanded     = false;
933
657k
  bool not_expanded = false;
934
657k
  if (!orig_word) orig_word = word;
935
936
1.46M
  while (*aff) {
937
803k
    if ((int)word.size() - max_strip_f[*aff] < limit) {
938
25.4M
      for (SfxEntry * p = sFlag[*aff]; p; p = p->flag_next) {
939
24.6M
        SimpleString newword = p->add(word, buf, limit, orig_word);
940
24.6M
        if (!newword) continue;
941
4.64M
        if (newword == EMPTY) {not_expanded = true; continue;}
942
4.64M
        *cur = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
943
4.64M
        (*cur)->word = newword;
944
4.64M
        (*cur)->aff  = (const byte *)EMPTY;
945
4.64M
        cur = &(*cur)->next;
946
4.64M
        expanded = true;
947
4.64M
      }
948
803k
    }
949
803k
    if (new_aff && (!expanded || not_expanded)) *new_aff++ = *aff;
950
803k
    ++aff;
951
803k
  }
952
657k
  *cur = 0;
953
657k
  if (new_aff) *new_aff = 0;
954
657k
  if (l) *l = cur;
955
657k
  return head;
956
657k
}
957
958
CheckAffixRes AffixMgr::check_affix(ParmString word, char aff) const
959
0
{
960
0
  CheckAffixRes res = InvalidAffix;
961
  
962
0
  for (PfxEntry * p = pFlag[(unsigned char)aff]; p; p = p->flag_next) {
963
0
    res = InapplicableAffix;
964
0
    if (p->applicable(word)) return ValidAffix;
965
0
  }
966
967
0
  for (SfxEntry * p = sFlag[(unsigned char)aff]; p; p = p->flag_next) {
968
0
    if (res == InvalidAffix) res = InapplicableAffix;
969
0
    if (p->applicable(word)) return ValidAffix;
970
0
  }
971
972
0
  return res;
973
0
}
974
975
976
977
//////////////////////////////////////////////////////////////////////
978
//
979
// LookupInfo
980
//
981
982
int LookupInfo::lookup (ParmString word, const SensitiveCompare * c, 
983
                        char achar, 
984
                        WordEntry & o, GuessInfo * gi) const
985
603k
{
986
603k
  SpellerImpl::WS::const_iterator i = begin;
987
603k
  const char * g = 0;
988
603k
  if (mode == Word) {
989
165k
    do {
990
165k
      (*i)->lookup(word, c, o);
991
168k
      for (;!o.at_end(); o.adv()) {
992
2.52k
        if (TESTAFF(o.aff, achar))
993
236
          return 1;
994
2.28k
        else
995
2.28k
          g = o.word;
996
2.52k
      }
997
165k
      ++i;
998
165k
    } while (i != end);
999
528k
  } else if (mode == Clean) {
1000
528k
    do {
1001
528k
      (*i)->clean_lookup(word, o);
1002
540k
      for (;!o.at_end(); o.adv()) {
1003
13.1k
        if (TESTAFF(o.aff, achar))
1004
788
          return 1;
1005
12.3k
        else
1006
12.3k
          g = o.word;
1007
13.1k
      }
1008
527k
      ++i;
1009
527k
    } while (i != end);
1010
528k
  } else if (gi) {
1011
501
    g = gi->dup(word);
1012
501
  }
1013
602k
  if (gi && g) {
1014
1.60k
    CheckInfo * ci = gi->add();
1015
1.60k
    ci->word = g;
1016
1.60k
    return -1;
1017
1.60k
  }
1018
600k
  return 0;
1019
602k
}
1020
1021
//////////////////////////////////////////////////////////////////////
1022
//
1023
// Affix Entry
1024
//
1025
1026
bool PfxEntry::applicable(SimpleString word) const
1027
0
{
1028
0
  unsigned int cond;
1029
  /* make sure all conditions match */
1030
0
  if ((word.size > stripl) && (word.size >= conds->num)) {
1031
0
    const byte * cp = (const byte *) word.str;
1032
0
    for (cond = 0;  cond < conds->num;  cond++) {
1033
0
      if ((conds->get(*cp++) & (1 << cond)) == 0)
1034
0
        break;
1035
0
    }
1036
0
    if (cond >= conds->num) return true;
1037
0
  }
1038
0
  return false;
1039
0
}
1040
1041
// add prefix to this word assuming conditions hold
1042
SimpleString PfxEntry::add(SimpleString word, ObjStack & buf) const
1043
285k
{
1044
285k
  unsigned int cond;
1045
  /* make sure all conditions match */
1046
285k
  if ((word.size > stripl) && (word.size >= conds->num)) {
1047
285k
    const byte * cp = (const byte *) word.str;
1048
350k
    for (cond = 0;  cond < conds->num;  cond++) {
1049
227k
      if ((conds->get(*cp++) & (1 << cond)) == 0)
1050
163k
        break;
1051
227k
    }
1052
285k
    if (cond >= conds->num) {
1053
      /* */
1054
122k
      int alen = word.size - stripl;
1055
122k
      char * newword = (char *)buf.alloc(alen + appndl + 1);
1056
122k
      if (appndl) memcpy(newword, appnd, appndl);
1057
122k
      memcpy(newword + appndl, word + stripl, alen + 1);
1058
122k
      return SimpleString(newword, alen + appndl);
1059
122k
    }
1060
285k
  }
1061
163k
  return SimpleString();
1062
285k
}
1063
1064
// check if this prefix entry matches 
1065
bool PfxEntry::check(const LookupInfo & linf, const AffixMgr * pmyMgr,
1066
                     ParmString word,
1067
                     CheckInfo & ci, GuessInfo * gi, bool cross) const
1068
155k
{
1069
155k
  unsigned int    cond; // condition number being examined
1070
155k
  unsigned              tmpl;   // length of tmpword
1071
155k
  WordEntry             wordinfo;     // hash entry of root word or NULL
1072
155k
  byte *  cp;   
1073
155k
  VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1);
1074
1075
  // on entry prefix is 0 length or already matches the beginning of the word.
1076
  // So if the remaining root word has positive length
1077
  // and if there are enough chars in root word and added back strip chars
1078
  // to meet the number of characters conditions, then test it
1079
1080
155k
  tmpl = word.size() - appndl;
1081
1082
155k
  if ((tmpl > 0) &&  (tmpl + stripl >= conds->num)) {
1083
1084
    // generate new root word by removing prefix and adding
1085
    // back any characters that would have been stripped
1086
1087
153k
    if (stripl) strcpy (tmpword, strip);
1088
153k
    strcpy ((tmpword + stripl), (word + appndl));
1089
1090
    // now make sure all of the conditions on characters
1091
    // are met.  Please see the appendix at the end of
1092
    // this file for more info on exactly what is being
1093
    // tested
1094
1095
153k
    cp = (byte *)tmpword;
1096
154k
    for (cond = 0;  cond < conds->num;  cond++) {
1097
7.49k
      if ((conds->get(*cp++) & (1 << cond)) == 0) break;
1098
7.49k
    }
1099
1100
    // if all conditions are met then check if resulting
1101
    // root word in the dictionary
1102
1103
153k
    if (cond >= conds->num) {
1104
147k
      CheckInfo * lci = 0;
1105
147k
      CheckInfo * guess = 0;
1106
147k
      tmpl += stripl;
1107
1108
147k
      int res = linf.lookup(tmpword, &linf.sp->s_cmp_end, achar, wordinfo, gi);
1109
1110
147k
      if (res == 1) {
1111
1112
1
        lci = &ci;
1113
1
        lci->word = wordinfo.word;
1114
1
        goto quit;
1115
        
1116
147k
      } else if (res == -1) {
1117
1118
184
        guess = gi->head;
1119
1120
184
      }
1121
      
1122
      // prefix matched but no root word was found 
1123
      // if XPRODUCT is allowed, try again but now 
1124
      // cross checked combined with a suffix
1125
      
1126
147k
      if (gi)
1127
6.61k
        lci = gi->head;
1128
      
1129
147k
      if (cross && xpflg & XPRODUCT) {
1130
147k
        if (pmyMgr->suffix_check(linf, ParmString(tmpword, tmpl), 
1131
147k
                                 ci, gi,
1132
147k
                                 XPRODUCT, (AffEntry *)this)) {
1133
26
          lci = &ci;
1134
          
1135
147k
        } else if (gi) {
1136
          
1137
6.61k
          CheckInfo * stop = lci;
1138
6.61k
          for (lci = gi->head; 
1139
6.76k
               lci != stop; 
1140
6.61k
               lci = const_cast<CheckInfo *>(lci->next)) 
1141
152
          {
1142
152
            lci->pre_flag = achar;
1143
152
            lci->pre_strip_len = stripl;
1144
152
            lci->pre_add_len = appndl;
1145
152
            lci->pre_add = appnd;
1146
152
          }
1147
          
1148
140k
        } else {
1149
          
1150
140k
          lci = 0;
1151
          
1152
140k
        }
1153
147k
      }
1154
    
1155
147k
      if (guess)
1156
184
        lci = guess;
1157
      
1158
147k
    quit:
1159
147k
      if (lci) {
1160
852
        lci->pre_flag = achar;
1161
852
        lci->pre_strip_len = stripl;
1162
852
        lci->pre_add_len = appndl;
1163
852
        lci->pre_add = appnd;
1164
852
      }
1165
147k
      if (lci == &ci) return true;
1166
147k
    }
1167
153k
  }
1168
155k
  return false;
1169
155k
}
1170
1171
bool SfxEntry::applicable(SimpleString word) const
1172
0
{
1173
0
  int cond;
1174
  /* make sure all conditions match */
1175
0
  if ((word.size > stripl) && (word.size >= conds->num)) {
1176
0
    const byte * cp = (const byte *) (word + word.size);
1177
0
    for (cond = conds->num; --cond >=0; ) {
1178
0
      if ((conds->get(*--cp) & (1 << cond)) == 0)
1179
0
        break;
1180
0
    }
1181
0
    if (cond < 0) return true;
1182
0
  }
1183
0
  return false;
1184
0
}
1185
1186
// add suffix to this word assuming conditions hold
1187
SimpleString SfxEntry::add(SimpleString word, ObjStack & buf, 
1188
                           int limit, SimpleString orig_word) const
1189
24.6M
{
1190
24.6M
  int cond;
1191
  /* make sure all conditions match */
1192
24.6M
  if ((orig_word.size > stripl) && (orig_word.size >= conds->num)) {
1193
18.9M
    const byte * cp = (const byte *) (orig_word + orig_word.size);
1194
37.3M
    for (cond = conds->num; --cond >=0; ) {
1195
32.6M
      if ((conds->get(*--cp) & (1 << cond)) == 0)
1196
14.3M
        break;
1197
32.6M
    }
1198
18.9M
    if (cond < 0) {
1199
4.64M
      int alen = word.size - stripl;
1200
4.64M
      if (alen >= limit) return EMPTY;
1201
      /* we have a match so add suffix */
1202
4.64M
      char * newword = (char *)buf.alloc(alen + appndl + 1);
1203
4.64M
      memcpy(newword, word, alen);
1204
4.64M
      memcpy(newword + alen, appnd, appndl + 1);
1205
4.64M
      return SimpleString(newword, alen + appndl);
1206
4.64M
    }
1207
18.9M
  }
1208
20.0M
  return SimpleString();
1209
24.6M
}
1210
1211
// see if this suffix is present in the word 
1212
bool SfxEntry::check(const LookupInfo & linf, ParmString word,
1213
                     CheckInfo & ci, GuessInfo * gi,
1214
                     int optflags, AffEntry* ppfx)
1215
48.8M
{
1216
48.8M
  unsigned              tmpl;    // length of tmpword 
1217
48.8M
  int     cond;    // condition beng examined
1218
48.8M
  WordEntry             wordinfo;        // hash entry pointer
1219
48.8M
  byte *  cp;
1220
48.8M
  VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1);
1221
48.8M
  PfxEntry* ep = (PfxEntry *) ppfx;
1222
1223
  // if this suffix is being cross checked with a prefix
1224
  // but it does not support cross products skip it
1225
1226
48.8M
  if ((optflags & XPRODUCT) != 0 &&  (xpflg & XPRODUCT) == 0)
1227
203
    return false;
1228
1229
  // upon entry suffix is 0 length or already matches the end of the word.
1230
  // So if the remaining root word has positive length
1231
  // and if there are enough chars in root word and added back strip chars
1232
  // to meet the number of characters conditions, then test it
1233
1234
48.8M
  tmpl = word.size() - appndl;
1235
1236
48.8M
  if ((tmpl > 0)  &&  (tmpl + stripl >= conds->num)) {
1237
1238
    // generate new root word by removing suffix and adding
1239
    // back any characters that would have been stripped or
1240
    // or null terminating the shorter string
1241
1242
41.3M
    strcpy (tmpword, word);
1243
41.3M
    cp = (byte *)(tmpword + tmpl);
1244
41.3M
    if (stripl) {
1245
39.5M
      strcpy ((char *)cp, strip);
1246
39.5M
      tmpl += stripl;
1247
39.5M
      cp = (byte *)(tmpword + tmpl);
1248
39.5M
    } else *cp = '\0';
1249
1250
    // now make sure all of the conditions on characters
1251
    // are met.  Please see the appendix at the end of
1252
    // this file for more info on exactly what is being
1253
    // tested
1254
1255
87.6M
    for (cond = conds->num;  --cond >= 0; ) {
1256
87.1M
      if ((conds->get(*--cp) & (1 << cond)) == 0) break;
1257
87.1M
    }
1258
1259
    // if all conditions are met then check if resulting
1260
    // root word in the dictionary
1261
1262
41.3M
    if (cond < 0) {
1263
456k
      CheckInfo * lci = 0;
1264
456k
      tmpl += stripl;
1265
456k
      const SensitiveCompare * cmp = 
1266
456k
        optflags & XPRODUCT ? &linf.sp->s_cmp_middle : &linf.sp->s_cmp_begin;
1267
456k
      int res = linf.lookup(tmpword, cmp, achar, wordinfo, gi);
1268
456k
      if (res == 1
1269
456k
          && ((optflags & XPRODUCT) == 0 || TESTAFF(wordinfo.aff, ep->achar)))
1270
907
      {
1271
907
        lci = &ci;
1272
907
        lci->word = wordinfo.word;
1273
455k
      } else if (res == 1 && gi) {
1274
40
        lci = gi->add();
1275
40
        lci->word = wordinfo.word;
1276
455k
      } else if (res == -1) { // gi must be defined
1277
1.42k
        lci = gi->head;
1278
1.42k
      }
1279
1280
456k
      if (lci) {
1281
2.37k
        lci->suf_flag = achar;
1282
2.37k
        lci->suf_strip_len = stripl;
1283
2.37k
        lci->suf_add_len = appndl;
1284
2.37k
        lci->suf_add = appnd;
1285
2.37k
      }
1286
      
1287
456k
      if (lci == &ci) return true;
1288
456k
    }
1289
41.3M
  }
1290
48.8M
  return false;
1291
48.8M
}
1292
1293
//////////////////////////////////////////////////////////////////////
1294
//
1295
// new_affix_mgr
1296
//
1297
1298
1299
PosibErr<AffixMgr *> new_affix_mgr(ParmString name, 
1300
                                   Conv & iconv,
1301
                                   const Language * lang)
1302
1.02k
{
1303
1.02k
  if (name == "none")
1304
0
    return 0;
1305
  //CERR << "NEW AFFIX MGR\n";
1306
1.02k
  String file;
1307
1.02k
  file += lang->data_dir();
1308
1.02k
  file += '/';
1309
1.02k
  file += lang->name();
1310
1.02k
  file += "_affix.dat";
1311
1.02k
  AffixMgr * affix;
1312
1.02k
  affix = new AffixMgr(lang);
1313
1.02k
  PosibErrBase pe = affix->setup(file, iconv);
1314
1.02k
  if (pe.has_err()) {
1315
0
    delete affix;
1316
0
    return pe;
1317
1.02k
  } else {
1318
1.02k
    return affix;
1319
1.02k
  }
1320
1.02k
}
1321
}
1322
1323
/**************************************************************************
1324
1325
Appendix:  Understanding Affix Code
1326
1327
1328
An affix is either a  prefix or a suffix attached to root words to make 
1329
other words.
1330
1331
Basically a Prefix or a Suffix is set of AffEntry objects
1332
which store information about the prefix or suffix along 
1333
with supporting routines to check if a word has a particular 
1334
prefix or suffix or a combination.
1335
1336
The structure affentry is defined as follows:
1337
1338
struct AffEntry
1339
{
1340
   unsigned char achar;   // char used to represent the affix
1341
   char * strip;          // string to strip before adding affix
1342
   char * appnd;          // the affix string to add
1343
   short  stripl;         // length of the strip string
1344
   short  appndl;         // length of the affix string
1345
   short  numconds;       // the number of conditions that must be met
1346
   short  xpflg;          // flag: XPRODUCT- combine both prefix and suffix 
1347
   char   conds[SETSIZE]; // array which encodes the conditions to be met
1348
};
1349
1350
1351
Here is a suffix borrowed from the en_US.aff file.  This file 
1352
is whitespace delimited.
1353
1354
SFX D Y 4 
1355
SFX D   0     e          d
1356
SFX D   y     ied        [^aeiou]y
1357
SFX D   0     ed         [^ey]
1358
SFX D   0     ed         [aeiou]y
1359
1360
This information can be interpreted as follows:
1361
1362
In the first line has 4 fields
1363
1364
Field
1365
-----
1366
1     SFX - indicates this is a suffix
1367
2     D   - is the name of the character flag which represents this suffix
1368
3     Y   - indicates it can be combined with prefixes (cross product)
1369
4     4   - indicates that sequence of 4 affentry structures are needed to
1370
               properly store the affix information
1371
1372
The remaining lines describe the unique information for the 4 SfxEntry 
1373
objects that make up this affix.  Each line can be interpreted
1374
as follows: (note fields 1 and 2 are as a check against line 1 info)
1375
1376
Field
1377
-----
1378
1     SFX         - indicates this is a suffix
1379
2     D           - is the name of the character flag for this affix
1380
3     y           - the string of chars to strip off before adding affix
1381
                         (a 0 here indicates the NULL string)
1382
4     ied         - the string of affix characters to add
1383
5     [^aeiou]y   - the conditions which must be met before the affix
1384
                    can be applied
1385
1386
Field 5 is interesting.  Since this is a suffix, field 5 tells us that
1387
there are 2 conditions that must be met.  The first condition is that 
1388
the next to the last character in the word must *NOT* be any of the 
1389
following "a", "e", "i", "o" or "u".  The second condition is that
1390
the last character of the word must end in "y".
1391
1392
So how can we encode this information concisely and be able to 
1393
test for both conditions in a fast manner?  The answer is found
1394
but studying the wonderful ispell code of Geoff Kuenning, et.al. 
1395
(now available under a normal BSD license).
1396
1397
If we set up a conds array of 256 bytes indexed (0 to 255) and access it
1398
using a character (cast to an unsigned char) of a string, we have 8 bits
1399
of information we can store about that character.  Specifically we
1400
could use each bit to say if that character is allowed in any of the 
1401
last (or first for prefixes) 8 characters of the word.
1402
1403
Basically, each character at one end of the word (up to the number 
1404
of conditions) is used to index into the conds array and the resulting 
1405
value found there says whether the that character is valid for a 
1406
specific character position in the word.  
1407
1408
For prefixes, it does this by setting bit 0 if that char is valid 
1409
in the first position, bit 1 if valid in the second position, and so on. 
1410
1411
If a bit is not set, then that char is not valid for that position in the
1412
word.
1413
1414
If working with suffixes bit 0 is used for the character closest 
1415
to the front, bit 1 for the next character towards the end, ..., 
1416
with bit numconds-1 representing the last char at the end of the string. 
1417
1418
Note: since entries in the conds[] are 8 bits, only 8 conditions 
1419
(read that only 8 character positions) can be examined at one
1420
end of a word (the beginning for prefixes and the end for suffixes.
1421
1422
So to make this clearer, lets encode the conds array values for the 
1423
first two affentries for the suffix D described earlier.
1424
1425
1426
  For the first affentry:    
1427
     numconds = 1             (only examine the last character)
1428
1429
     conds['e'] =  (1 << 0)   (the word must end in an E)
1430
     all others are all 0
1431
1432
  For the second affentry:
1433
     numconds = 2             (only examine the last two characters)     
1434
1435
     conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
1436
         where X is all characters *but* a, e, i, o, or u
1437
         
1438
1439
     conds['y'] = (1 << 1)     (the last char must be a y)
1440
     all other bits for all other entries in the conds array are zero
1441
1442
1443
**************************************************************************/