Coverage Report

Created: 2025-08-29 06:09

/src/aspell/modules/speller/default/affix.cpp
Line
Count
Source (jump to first uncovered line)
1
// This file is part of The New Aspell
2
// Copyright (C) 2004 by Kevin Atkinson under the GNU LGPL
3
// license version 2.0 or 2.1.  You should have received a copy of the
4
// LGPL license along with this library if you did not you can find it
5
// at http://www.gnu.org/.
6
//
7
// This code is based on the MySpell affix code:
8
//
9
/*
10
 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada And
11
 * Contributors.  All rights reserved.
12
 *
13
 * Redistribution and use in source and binary forms, with or without
14
 * modification, are permitted provided that the following conditions
15
 * are met:
16
 *
17
 * 1. Redistributions of source code must retain the above copyright
18
 *    notice, this list of conditions and the following disclaimer.
19
 *
20
 * 2. Redistributions in binary form must reproduce the above copyright
21
 *    notice, this list of conditions and the following disclaimer in the
22
 *    documentation and/or other materials provided with the distribution.
23
 *
24
 * 3. All modifications to the source code must be clearly marked as
25
 *    such.  Binary redistributions based on modified source code
26
 *    must be clearly marked as modified versions in the documentation
27
 *    and/or other materials provided with the distribution.
28
 *
29
 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
30
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
31
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
32
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
33
 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
34
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
35
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
36
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40
 * SUCH DAMAGE.
41
 *
42
 */
43
44
#include <cstdlib>
45
#include <cstring>
46
#include <cstdio>
47
48
//#include "iostream.hpp"
49
50
#include "affix.hpp"
51
#include "errors.hpp"
52
#include "getdata.hpp"
53
#include "parm_string.hpp"
54
#include "check_list.hpp"
55
#include "speller_impl.hpp"
56
#include "vararray.hpp"
57
#include "lsort.hpp"
58
#include "hash-t.hpp"
59
60
#include "gettext.h"
61
62
using namespace std;
63
64
namespace aspeller {
65
66
typedef unsigned char byte;
67
static char EMPTY[1] = {0};
68
69
//////////////////////////////////////////////////////////////////////
70
//
71
// Entry struct definitions
72
//
73
74
struct Conds
75
{
76
  char * str;
77
  unsigned num;
78
  char conds[SETSIZE];
79
37.2M
  char get(byte i) const {return conds[i];}
80
};
81
82
struct AffEntry
83
{
84
  const char *   appnd;
85
  const char *   strip;
86
  byte           appndl;
87
  byte           stripl;
88
  byte           xpflg;
89
  char           achar;
90
  const Conds *  conds;
91
  //unsigned int numconds;
92
  //char         conds[SETSIZE];
93
};
94
95
// A Prefix Entry
96
  
97
struct PfxEntry : public AffEntry
98
{
99
  PfxEntry * next;
100
  PfxEntry * next_eq;
101
  PfxEntry * next_ne;
102
  PfxEntry * flag_next;
103
4.60k
  PfxEntry() {}
104
105
  bool check(const LookupInfo &, const AffixMgr * pmyMgr,
106
             ParmString, CheckInfo &, GuessInfo *, bool cross = true) const;
107
108
60.5k
  inline bool          allow_cross() const { return ((xpflg & XPRODUCT) != 0); }
109
4.60k
  inline byte flag() const { return achar;  }
110
713k
  inline const char *  key() const  { return appnd;  }
111
  bool applicable(SimpleString) const;
112
  SimpleString add(SimpleString, ObjStack & buf) const;
113
};
114
115
// A Suffix Entry
116
117
struct SfxEntry : public AffEntry
118
{
119
  const char * rappnd; // this is set in AffixMgr::build_sfxlist
120
  
121
  SfxEntry *   next;
122
  SfxEntry *   next_eq;
123
  SfxEntry *   next_ne;
124
  SfxEntry *   flag_next;
125
126
428k
  SfxEntry() {}
127
128
  bool check(const LookupInfo &, ParmString, CheckInfo &, GuessInfo *,
129
             int optflags, AffEntry * ppfx);
130
131
326k
  inline bool          allow_cross() const { return ((xpflg & XPRODUCT) != 0); }
132
428k
  inline byte flag() const { return achar;  }
133
304M
  inline const char *  key() const  { return rappnd; } 
134
  bool applicable(SimpleString) const;
135
  SimpleString add(SimpleString, ObjStack & buf, int limit, SimpleString) const;
136
};
137
138
//////////////////////////////////////////////////////////////////////
139
//
140
// Utility functions declarations
141
//
142
143
/* Returns true when every character of s1 matches the character at the
   same position of s2, i.e. when s1 is a leading subset of s2.  An empty
   s1 always matches. */
static bool isSubset(const char * s1, const char * s2)
{
  for (; *s1 != '\0'; ++s1, ++s2) {
    if (*s1 != *s2) return false;
  }
  return true;
}
152
153
// Returns true when s1, read forwards, matches s2 read backwards starting
// at end_of_s2.  At most len characters of s2 are examined, so s1 only
// matches if it is no longer than len.
static bool isRevSubset(const char * s1, const char * end_of_s2, int len)
{
  for (; len > 0 && *s1 != '\0'; ++s1, --end_of_s2, --len) {
    if (*s1 != *end_of_s2) return false;
  }
  // either s1 is exhausted (match) or s2 ran out first (no match)
  return *s1 == '\0';
}
163
164
// Strict ordering of two entries by their key() strings, compared with
// strcmp.
template <class T>
struct AffixLess
{
  bool operator() (T * x, T * y) const
  {
    const int order = strcmp(x->key(), y->key());
    return order < 0;
  }
};
169
170
// struct StringLookup {
171
//   struct Parms {
172
//     typedef const char * Value;
173
//     typedef const char * Key;
174
//     static const bool is_multi = false;
175
//     hash<const char *> hfun;
176
//     size_t hash(const char * s) {return hfun(s);}
177
//     bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;}
178
//     const char * key(const char * c) {return c;}
179
//   };
180
//   typedef HashTable<Parms> Lookup;
181
//   Lookup lookup;
182
//   ObjStack * data_buf;
183
//   StringLookup(ObjStack * b) : data_buf(b) {}
184
//   const char * dup(const char * orig) {
185
//     pair<Lookup::iterator, bool> res = lookup.insert(orig);
186
//     if (res.second) *res.first = data_buf->dup(orig);
187
//     return *res.first;
188
//     //return data_buf->dup(orig);
189
//   }
190
// };
191
192
struct CondsLookupParms {
193
  typedef const Conds * Value;
194
  typedef const char * Key;
195
  static const bool is_multi = false;
196
  acommon::hash<const char *> hfun;
197
448k
  size_t hash(const char * s) {return hfun(s);}
198
668k
  bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;}
199
684k
  const char * key(const Conds * c) {return c->str;}
200
};
201
202
typedef HashTable<CondsLookupParms> CondsLookup;
203
204
// Normalizes and checks the condition string, rewriting it in place.
//
// A group holding a single character, such as "[a]", is replaced by the
// bare character.  The members of any larger group are sorted by their
// unsigned character value; a '^' directly after the '[' is kept in
// front of the sorted members.  Everything outside a group is copied
// unchanged.
//
// Returns the length of the new string, or -1 if the string is invalid:
// a '[' followed directly by ']' or by the end of the string, a '['
// reached while walking the members of a group, or a group that is never
// closed.  When -1 is returned the string may have been partly rewritten.
static int normalize_cond_str(char * str)
{
  char * s = str;   // read position
  char * d = str;   // write position, never ahead of s
  while (*s) {
    if (*s != '[') {
      *d++ = *s++;
    } else if (s[1] == '\0' || s[1] == ']') {
      return -1;
    } else if (s[2] == ']') {
      // one member only: drop the brackets
      *d++ = s[1];
      s += 3;
    } else {
      *d++ = *s++;
      if (*s == '^') *d++ = *s++;
      // selection sort of the group members
      while (*s != ']') {
        if (*s == '\0' || *s == '[') return -1;
        char * min = s;
        // The scan must also stop at the terminator: when the closing
        // ']' is missing there is nothing else to stop it and it would
        // read past the end of the string.  The outer loop then reaches
        // the terminator and reports the error.
        for (char * i = s + 1; *i != ']' && *i != '\0'; ++i) {
          if ((unsigned char)*i < (unsigned char)*min) min = i;
        }
        char c = *s;
        *d++ = *min;
        *min = c;
        ++s;
      }
      *d++ = *s++;
    }
  }
  *d = '\0';
  return d - str;
}
237
238
// Forward declaration of encodeit(); the definition appears further down
// in this file, after the AffixMgr list-building functions.
static void encodeit(CondsLookup &, ObjStack &, 
239
                     AffEntry * ptr, char * cs);
240
241
//////////////////////////////////////////////////////////////////////
242
//
243
// Affix Manager
244
//
245
246
// Clears every lookup table and the recorded strip lengths, then loads
// the affix data from the file `affpath`.  Returns whatever parse_file
// returns.
PosibErr<void> AffixMgr::setup(ParmString affpath, Conv & iconv)
{
  for (int slot = 0; slot < SETSIZE; ++slot) {
    pFlag[slot]  = NULL;
    pStart[slot] = NULL;
    sFlag[slot]  = NULL;
    sStart[slot] = NULL;
    max_strip_f[slot] = 0;
  }
  max_strip_ = 0;

  return parse_file(affpath, iconv);
}
260
261
// Remembers the language and constructs data_buf with 16 * 1024 as its
// argument.
AffixMgr::AffixMgr(const Language * l)
  : lang(l)
  , data_buf(16 * 1024)
{
}

// Nothing is released explicitly here.
AffixMgr::~AffixMgr()
{
}
265
266
// Raises lhs to rhs when rhs is the larger of the two.
static inline void max_(int & lhs, int rhs)
{
  if (rhs > lhs) {
    lhs = rhs;
  }
}
270
271
// read in aff file and build up prefix and suffix entry objects 
272
PosibErr<void> AffixMgr::parse_file(const char * affpath, Conv & iconv)
273
408
{
274
  // io buffers
275
408
  String buf; DataPair dp;
276
277
408
  CondsLookup conds_lookup;
278
 
279
  // open the affix file
280
408
  affix_file = data_buf.dup(affpath);
281
408
  FStream afflst;
282
408
  RET_ON_ERR(afflst.open(affpath,"r"));
283
284
  // step one is to parse the affix file building up the internal
285
  // affix data structures
286
287
  // read in each line ignoring any that do not
288
  // start with a known line type indicator
289
290
408
  char prev_aff = '\0';
291
292
64.9k
  while (getdata_pair(afflst,dp,buf)) {
293
64.5k
    char affix_type = ' ';
294
295
    /* parse in the name of the character set used by the .dict and .aff */
296
297
64.5k
    if (dp.key == "SET") {
298
408
      String buf;
299
408
      encoding = data_buf.dup(fix_encoding_str(dp.value, buf));
300
408
      if (strcmp(encoding, lang->data_encoding()) != 0)
301
0
        return make_err(incorrect_encoding, affix_file, lang->data_encoding(), encoding);
302
408
    }
303
304
    /* parse in the flag used by the controlled compound words */
305
    //else if (d.key == "COMPOUNDFLAG")
306
    //  compound = data_buf.dup(d.value);
307
308
    /* parse in the flag used by the controlled compound words */
309
    //else if (d.key == "COMPOUNDMIN")
310
    //  cpdmin = atoi(d.value); // FiXME
311
312
    //else if (dp.key == "TRY" || dp.key == "REP");
313
314
64.1k
    else if (dp.key == "PFX" || dp.key == "SFX")
315
10.6k
      affix_type = dp.key[0];
316
317
64.5k
    if (affix_type == ' ') continue;
318
319
    //
320
    // parse this affix: P - prefix, S - suffix
321
    //
322
323
10.6k
    int numents = 0;      // number of affentry structures to parse
324
10.6k
    char achar='\0';      // affix char identifier
325
10.6k
    short xpflg=0;
326
10.6k
    AffEntry * nptr;
327
10.6k
    {
328
      // split affix header line into pieces
329
10.6k
      split(dp);
330
10.6k
      if (dp.key.empty()) goto error;
331
      // key is affix char
332
10.6k
      const char * astr = iconv(dp.key);
333
10.6k
      if (astr[0] == '\0' || astr[1] != '\0') goto error;
334
10.6k
      achar = astr[0];
335
10.6k
      if (achar == prev_aff) goto error_count;
336
10.6k
      prev_aff = achar;
337
338
10.6k
      split(dp);
339
10.6k
      if (dp.key.size != 1 || 
340
10.6k
          !(dp.key[0] == 'Y' || dp.key[0] == 'N')) goto error;
341
      // key is cross product indicator 
342
10.6k
      if (dp.key[0] == 'Y') xpflg = XPRODUCT;
343
    
344
10.6k
      split(dp);
345
10.6k
      if (dp.key.empty()) goto error;
346
      // key is number of affentries
347
      
348
10.6k
      numents = atoi(dp.key); 
349
  
350
443k
      for (int j = 0; j < numents; j++) {
351
432k
        getdata_pair(afflst, dp, buf);
352
353
432k
        if (affix_type == 'P') {
354
4.60k
          nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(PfxEntry));
355
4.60k
          new (nptr) PfxEntry;
356
428k
        } else {
357
428k
          nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(SfxEntry));
358
428k
          new (nptr) SfxEntry;
359
428k
        }
360
361
432k
        nptr->xpflg = xpflg;
362
363
432k
        split(dp);
364
432k
        if (dp.key.empty()) goto error;
365
        // key is affix charter
366
432k
        if (iconv(dp.key)[0] != achar) goto error_count;
367
432k
        nptr->achar = achar;
368
 
369
432k
        split(dp);
370
432k
        if (dp.key.empty()) goto error;
371
        // key is strip 
372
432k
        if (dp.key != "0") {
373
295k
          ParmString s0(iconv(dp.key));
374
295k
          max_(max_strip_, s0.size());
375
295k
          max_(max_strip_f[(byte)achar], s0.size());
376
295k
          nptr->strip = data_buf.dup(s0);
377
295k
          nptr->stripl = s0.size();
378
295k
        } else {
379
137k
          nptr->strip  = "";
380
137k
          nptr->stripl = 0;
381
137k
        }
382
    
383
432k
        split(dp);
384
432k
        if (dp.key.empty()) goto error;
385
        // key is affix string or 0 for null
386
432k
        if (dp.key != "0") {
387
431k
          nptr->appnd  = data_buf.dup(iconv(dp.key));
388
431k
          nptr->appndl = strlen(nptr->appnd);
389
431k
        } else {
390
928
          nptr->appnd  = "";
391
928
          nptr->appndl = 0;
392
928
        }
393
    
394
432k
        split(dp);
395
432k
        if (dp.key.empty()) goto error;
396
        // key is the conditions descriptions
397
432k
        char * cond = iconv(dp.key);
398
432k
        int cond_len = normalize_cond_str(cond);
399
432k
        if (cond_len < 0)
400
0
          return (make_err(invalid_cond, MsgConv(lang)(cond))
401
0
                  .with_file(affix_file, dp.line_num));
402
432k
        if (nptr->stripl != 0) {
403
295k
          char * cc = cond;
404
295k
          if (affix_type == 'S') cc += cond_len - nptr->stripl;
405
295k
          if (cond_len < nptr->stripl || 
406
295k
              memcmp(cc, nptr->strip, nptr->stripl) != 0)
407
0
            return (make_err(invalid_cond_strip, 
408
0
                             MsgConv(lang)(cond), MsgConv(lang)(nptr->strip))
409
0
                    .with_file(affix_file, dp.line_num));
410
295k
        }
411
432k
        encodeit(conds_lookup, data_buf, nptr, cond);
412
    
413
        // now create SfxEntry or PfxEntry objects and use links to
414
        // build an ordered (sorted by affix string) list
415
432k
        if (affix_type == 'P')
416
4.60k
          build_pfxlist(static_cast<PfxEntry *>(nptr));
417
428k
        else
418
428k
          build_sfxlist(static_cast<SfxEntry *>(nptr)); 
419
432k
      }
420
10.6k
    }
421
10.6k
    continue;
422
10.6k
  error:
423
0
    return make_err(corrupt_affix, MsgConv(lang)(achar)).with_file(affix_file, dp.line_num);
424
0
  error_count:
425
0
    return make_err(corrupt_affix, MsgConv(lang)(achar), 
426
0
                    _("Possibly incorrect count.")).with_file(affix_file, dp.line_num);
427
10.6k
  }
428
408
  afflst.close();
429
430
  // now we can speed up performance greatly taking advantage of the 
431
  // relationship between the affixes and the idea of "subsets".
432
433
  // View each prefix as a potential leading subset of another and view
434
  // each suffix (reversed) as a potential trailing subset of another.
435
436
  // To illustrate this relationship if we know the prefix "ab" is
437
  // found in the word to examine, only prefixes that "ab" is a
438
  // leading subset of need be examined.  Furthermore is "ab" is not
439
  // present then none of the prefixes that "ab" is is a subset need
440
  // be examined.
441
442
  // The same argument goes for suffix string that are reversed.
443
444
  // Then to top this off why not examine the first char of the word
445
  // to quickly limit the set of prefixes to examine (i.e. the
446
  // prefixes to examine must be leading supersets of the first
447
  // character of the word (if they exist)
448
 
449
  // To take advantage of this "subset" relationship, we need to add
450
  // two links from entry.  One to take next if the current prefix
451
  // is found (call it nexteq) and one to take next if the current
452
  // prefix is not found (call it nextne).
453
454
  // Since we have built ordered lists, all that remains is to
455
  // properly initialize the nextne and nexteq pointers that relate
456
  // them
457
458
408
  process_pfx_order();
459
408
  process_sfx_order();
460
461
  //CERR.printf("%u\n", data_buf.calc_size()/1024);
462
463
408
  return no_err;
464
465
408
}
466
467
468
// we want to be able to quickly access prefix information
469
// both by prefix flag, and sorted by prefix string itself
470
// so we need to set up two indexes
471
472
PosibErr<void> AffixMgr::build_pfxlist(PfxEntry* pfxptr)
473
4.60k
{
474
4.60k
  PfxEntry * ptr;
475
4.60k
  PfxEntry * ep = pfxptr;
476
477
  // get the right starting point 
478
4.60k
  const char * key = ep->key();
479
4.60k
  const byte flg = ep->flag();
480
481
  // first index by flag which must exist
482
4.60k
  ptr = pFlag[flg];
483
4.60k
  ep->flag_next = ptr;
484
4.60k
  pFlag[flg] = ep;
485
486
  // next insert the affix string, it will be sorted latter
487
488
4.60k
  byte sp = *((const byte *)key);
489
4.60k
  ptr = pStart[sp];
490
4.60k
  ep->next = ptr;
491
4.60k
  pStart[sp] = ep;
492
4.60k
  return no_err;
493
4.60k
}
494
495
// we want to be able to quickly access suffix information
496
// both by suffix flag, and sorted by the reverse of the
497
// suffix string itself; so we need to set up two indexes
498
499
PosibErr<void> AffixMgr::build_sfxlist(SfxEntry* sfxptr)
500
428k
{
501
428k
  SfxEntry * ptr;
502
428k
  SfxEntry * ep = sfxptr;
503
428k
  char * tmp = (char *)data_buf.alloc(sfxptr->appndl + 1);
504
428k
  sfxptr->rappnd = tmp;
505
506
  // reverse the string
507
428k
  char * dest = tmp + sfxptr->appndl;
508
428k
  *dest-- = 0;
509
428k
  const char * src = sfxptr->appnd;
510
2.92M
  for (; dest >= tmp; --dest, ++src)
511
2.49M
    *dest = *src;
512
513
  /* get the right starting point */
514
428k
  const char * key = ep->key();
515
428k
  const byte flg = ep->flag();
516
517
  // first index by flag which must exist
518
428k
  ptr = sFlag[flg];
519
428k
  ep->flag_next = ptr;
520
428k
  sFlag[flg] = ep;
521
522
  // next insert the affix string, it will be sorted latter
523
    
524
428k
  byte sp = *((const byte *)key);
525
428k
  ptr = sStart[sp];
526
428k
  ep->next = ptr;
527
428k
  sStart[sp] = ep;
528
428k
  return no_err;
529
428k
}
530
531
532
533
// initialize the PfxEntry links NextEQ and NextNE to speed searching
534
PosibErr<void> AffixMgr::process_pfx_order()
535
408
{
536
408
  PfxEntry* ptr;
537
538
  // loop through each prefix list starting point
539
104k
  for (int i=1; i < SETSIZE; i++) {
540
541
104k
    ptr = pStart[i];
542
543
104k
    if (ptr && ptr->next)
544
584
      ptr = pStart[i] = sort(ptr, AffixLess<PfxEntry>());
545
546
    // look through the remainder of the list
547
    //  and find next entry with affix that 
548
    // the current one is not a subset of
549
    // mark that as destination for NextNE
550
    // use next in list that you are a subset
551
    // of as NextEQ
552
553
108k
    for (; ptr != NULL; ptr = ptr->next) {
554
555
4.60k
      PfxEntry * nptr = ptr->next;
556
6.63k
      for (; nptr != NULL; nptr = nptr->next) {
557
3.83k
        if (! isSubset( ptr->key() , nptr->key() )) break;
558
3.83k
      }
559
4.60k
      ptr->next_ne = nptr;
560
4.60k
      ptr->next_eq = NULL;
561
4.60k
      if ((ptr->next) && isSubset(ptr->key() , 
562
2.00k
                                  (ptr->next)->key())) 
563
496
        ptr->next_eq = ptr->next;
564
4.60k
    }
565
566
    // now clean up by adding smart search termination strings
567
    // if you are already a superset of the previous prefix
568
    // but not a subset of the next, search can end here
569
    // so set NextNE properly
570
571
104k
    ptr = pStart[i];
572
108k
    for (; ptr != NULL; ptr = ptr->next) {
573
4.60k
      PfxEntry * nptr = ptr->next;
574
4.60k
      PfxEntry * mptr = NULL;
575
6.63k
      for (; nptr != NULL; nptr = nptr->next) {
576
3.83k
        if (! isSubset(ptr->key(),nptr->key())) break;
577
2.03k
        mptr = nptr;
578
2.03k
      }
579
4.60k
      if (mptr) mptr->next_ne = NULL;
580
4.60k
    }
581
104k
  }
582
408
  return no_err;
583
408
}
584
585
586
587
// initialize the SfxEntry links NextEQ and NextNE to speed searching
588
PosibErr<void> AffixMgr::process_sfx_order()
589
408
{
590
408
  SfxEntry* ptr;
591
592
  // loop through each prefix list starting point
593
104k
  for (int i=1; i < SETSIZE; i++) {
594
595
104k
    ptr = sStart[i];
596
597
104k
    if (ptr && ptr->next)
598
3.56k
      ptr = sStart[i] = sort(ptr, AffixLess<SfxEntry>());
599
600
    // look through the remainder of the list
601
    //  and find next entry with affix that 
602
    // the current one is not a subset of
603
    // mark that as destination for NextNE
604
    // use next in list that you are a subset
605
    // of as NextEQ
606
607
531k
    for (; ptr != NULL; ptr = ptr->next) {
608
427k
      SfxEntry * nptr = ptr->next;
609
72.6M
      for (; nptr != NULL; nptr = nptr->next) {
610
72.6M
        if (! isSubset(ptr->key(),nptr->key())) break;
611
72.6M
      }
612
427k
      ptr->next_ne = nptr;
613
427k
      ptr->next_eq = NULL;
614
427k
      if ((ptr->next) && isSubset(ptr->key(),(ptr->next)->key())) 
615
348k
        ptr->next_eq = ptr->next;
616
427k
    }
617
618
619
    // now clean up by adding smart search termination strings:
620
    // if you are already a superset of the previous suffix
621
    // but not a subset of the next, search can end here
622
    // so set NextNE properly
623
624
104k
    ptr = sStart[i];
625
531k
    for (; ptr != NULL; ptr = ptr->next) {
626
427k
      SfxEntry * nptr = ptr->next;
627
427k
      SfxEntry * mptr = NULL;
628
72.6M
      for (; nptr != NULL; nptr = nptr->next) {
629
72.6M
        if (! isSubset(ptr->key(),nptr->key())) break;
630
72.2M
        mptr = nptr;
631
72.2M
      }
632
427k
      if (mptr) mptr->next_ne = NULL;
633
427k
    }
634
104k
  }
635
408
  return no_err;
636
408
}
637
638
// takes aff file condition string and creates the
639
// conds array - please see the appendix at the end of the
640
// file affentry.cxx which describes what is going on here
641
// in much more detail
642
643
static void encodeit(CondsLookup & l, ObjStack & buf, 
644
                     AffEntry * ptr, char * cs)
645
432k
{
646
432k
  byte c;
647
432k
  int i, j, k;
648
649
  // see if we already have this conds matrix
650
651
432k
  CondsLookup::iterator itr = l.find(cs);
652
432k
  if (!(itr == l.end())) {
653
422k
    ptr->conds = *itr;
654
422k
    return;
655
422k
  }
656
657
10.0k
  Conds * cds = (Conds *)buf.alloc_bottom(sizeof(Conds));
658
10.0k
  cds->str = buf.dup(cs);
659
10.0k
  l.insert(cds);
660
10.0k
  ptr->conds = cds;
661
662
10.0k
  int nc = strlen(cs);
663
10.0k
  VARARRAYM(byte, mbr, nc + 1, MAXLNLEN);
664
665
  // now clear the conditions array
666
10.0k
  memset(cds->conds, 0, sizeof(cds->conds));
667
668
  // now parse the string to create the conds array
669
  
670
10.0k
  int neg = 0;   // complement indicator
671
10.0k
  int grp = 0;   // group indicator
672
10.0k
  int n = 0;     // number of conditions
673
10.0k
  int ec = 0;    // end condition indicator
674
10.0k
  int nm = 0;    // number of member in group
675
676
  // if no condition just return
677
10.0k
  if (strcmp(cs,".")==0) {
678
408
    cds->num = 0;
679
408
    return;
680
408
  }
681
682
9.64k
  i = 0;
683
57.8k
  while (i < nc) {
684
48.2k
    c = *((byte *)(cs + i));
685
686
    // start group indicator
687
48.2k
    if (c == '[') {
688
4.72k
      grp = 1;
689
4.72k
      c = 0;
690
4.72k
    }
691
692
    // complement flag
693
48.2k
    if ((grp == 1) && (c == '^')) {
694
3.35k
      neg = 1;
695
3.35k
      c = 0;
696
3.35k
    }
697
698
    // end goup indicator
699
48.2k
    if (c == ']') {
700
4.72k
      ec = 1;
701
4.72k
      c = 0;
702
4.72k
    }
703
704
    // add character of group to list
705
48.2k
    if ((grp == 1) && (c != 0)) {
706
16.4k
      *(mbr + nm) = c;
707
16.4k
      nm++;
708
16.4k
      c = 0;
709
16.4k
    }
710
711
    // end of condition 
712
48.2k
    if (c != 0) {
713
18.9k
      ec = 1;
714
18.9k
    }
715
716
    
717
48.2k
    if (ec) {
718
23.6k
      if (grp == 1) {
719
4.72k
        if (neg == 0) {
720
          // set the proper bits in the condition array vals for those chars
721
7.03k
          for (j=0;j<nm;j++) {
722
5.65k
            k = (unsigned int) mbr[j];
723
5.65k
            cds->conds[k] = cds->conds[k] | (1 << n);
724
5.65k
          }
725
3.35k
        } else {
726
          // complement so set all of them and then unset indicated ones
727
861k
          for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n);
728
14.1k
          for (j=0;j<nm;j++) {
729
10.7k
            k = (unsigned int) mbr[j];
730
10.7k
            cds->conds[k] = cds->conds[k] & ~(1 << n);
731
10.7k
          }
732
3.35k
        }
733
4.72k
        neg = 0;
734
4.72k
        grp = 0;   
735
4.72k
        nm = 0;
736
18.9k
      } else {
737
        // not a group so just set the proper bit for this char
738
        // but first handle special case of . inside condition
739
18.9k
        if (c == '.') {
740
          // wild card character so set them all
741
0
          for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n);
742
18.9k
        } else {  
743
18.9k
          cds->conds[(unsigned int)c] = cds->conds[(unsigned int)c] | (1 << n);
744
18.9k
        }
745
18.9k
      }
746
23.6k
      n++;
747
23.6k
      ec = 0;
748
23.6k
    }
749
750
751
48.2k
    i++;
752
48.2k
  }
753
9.64k
  cds->num = n;
754
9.64k
  return;
755
10.0k
}
756
757
758
// check word for prefixes
759
bool AffixMgr::prefix_check (const LookupInfo & linf, ParmString word, 
760
                             CheckInfo & ci, GuessInfo * gi, bool cross) const
761
382k
{
762
382k
  if (word.empty()) return false;
763
 
764
  // first handle the special case of 0 length prefixes
765
382k
  PfxEntry * pe = pStart[0];
766
382k
  while (pe) {
767
0
    if (pe->check(linf,this,word,ci,gi)) return true;
768
0
    pe = pe->next;
769
0
  }
770
  
771
  // now handle the general case
772
382k
  byte sp = *reinterpret_cast<const byte *>(word.str());
773
382k
  PfxEntry * pptr = pStart[sp];
774
775
1.06M
  while (pptr) {
776
680k
    if (isSubset(pptr->key(),word)) {
777
79.2k
      if (pptr->check(linf,this,word,ci,gi,cross)) return true;
778
79.0k
      pptr = pptr->next_eq;
779
601k
    } else {
780
601k
      pptr = pptr->next_ne;
781
601k
    }
782
680k
  }
783
    
784
382k
  return false;
785
382k
}
786
787
788
// check word for suffixes
789
bool AffixMgr::suffix_check (const LookupInfo & linf, ParmString word, 
790
                             CheckInfo & ci, GuessInfo * gi,
791
                             int sfxopts, AffEntry * ppfx) const
792
469k
{
793
469k
  if (word.empty()) return false;
794
795
  // first handle the special case of 0 length suffixes
796
469k
  SfxEntry * se = sStart[0];
797
19.3M
  while (se) {
798
18.9M
    if (se->check(linf, word, ci, gi, sfxopts, ppfx)) return true;
799
18.9M
    se = se->next;
800
18.9M
  }
801
  
802
  // now handle the general case
803
469k
  byte sp = *((const byte *)(word + word.size() - 1));
804
469k
  SfxEntry * sptr = sStart[sp];
805
806
3.90M
  while (sptr) {
807
3.43M
    if (isRevSubset(sptr->key(), word + word.size() - 1, word.size())) {
808
2.14M
      if (sptr->check(linf, word, ci, gi, sfxopts, ppfx)) return true;
809
2.14M
      sptr = sptr->next_eq;
810
2.14M
    } else {
811
1.29M
      sptr = sptr->next_ne;
812
1.29M
    }
813
3.43M
  }
814
    
815
468k
  return false;
816
469k
}
817
818
// check if word with affixes is correctly spelled
819
bool AffixMgr::affix_check(const LookupInfo & linf, ParmString word, 
820
                           CheckInfo & ci, GuessInfo * gi) const
821
382k
{
822
382k
  if (word.empty()) return false;
823
824
  // Deal With Case in a semi-intelligent manner
825
381k
  CasePattern cp = lang->LangImpl::case_pattern(word);
826
381k
  ParmString pword = word;
827
381k
  ParmString sword = word;
828
381k
  CharVector lower;
829
381k
  if (cp == FirstUpper) {
830
11.2k
    lower.append(word, word.size() + 1);
831
11.2k
    lower[0] = lang->to_lower(word[0]);
832
11.2k
    pword = ParmString(lower.data(), lower.size() - 1);
833
370k
  } else if (cp == AllUpper) {
834
55.5k
    lower.resize(word.size() + 1);
835
55.5k
    unsigned int i = 0;
836
1.05M
    for (; i != word.size(); ++i)
837
1.00M
      lower[i] = lang->to_lower(word[i]);
838
55.5k
    lower[i] = '\0';
839
55.5k
    pword = ParmString(lower.data(), lower.size() - 1);
840
55.5k
    sword = pword;
841
55.5k
  }
842
843
  // check all prefixes (also crossed with suffixes if allowed) 
844
381k
  if (prefix_check(linf, pword, ci, gi)) return true;
845
846
  // if still not found check all suffixes
847
381k
  if (suffix_check(linf, sword, ci, gi, 0, NULL)) return true;
848
849
  // if still not found check again but with the lower case version
850
  // which can make a difference if the entire word matches the cond
851
  // string
852
381k
  if (cp == FirstUpper) {
853
11.2k
    return suffix_check(linf, pword, ci, gi, 0, NULL);
854
369k
  } else {
855
369k
    return false;
856
369k
  }
857
381k
}
858
859
// Collect in `gi` the ways `word` could be split into a root plus affixes.
//
// `gi` is always reset first.  All-upper-case words are not analysed at
// all, and prefixes are not tried on words whose first letter is upper
// case.  The lookup object is created in AlwaysTrue mode with no speller
// attached; the boolean results of the checks are deliberately ignored
// since only the guesses accumulated in `gi` are of interest.
//
// `cross` is forwarded to prefix_check.
void AffixMgr::munch(ParmString word, GuessInfo * gi, bool cross) const
{
  gi->reset();

  const CasePattern cp = lang->LangImpl::case_pattern(word);
  if (cp == AllUpper) return;

  LookupInfo li(0, LookupInfo::AlwaysTrue);
  CheckInfo ci;

  if (cp != FirstUpper)
    prefix_check(li, word, ci, gi, cross);
  suffix_check(li, word, ci, gi, 0, NULL);
}
870
871
// Expand `word` according to the affix flags listed in `aff`.
//
// Returns the head of a singly linked, null terminated list of WordAff
// nodes; all storage comes from `buf`.  The list is built in two steps:
//
//  1. The first node is `word` itself, followed by one node for every
//     prefix entry that can be added to it.  At this point the base word
//     carries all suffix flags found in `aff`, a prefixed word carries only
//     the suffix flags that allow cross products if its prefix allows cross
//     products, and the empty flag string otherwise.
//  2. Unless `limit` is 0, every node from step 1 is run through
//     expand_suffix, which appends the suffixed forms to the end of the
//     list and writes the flags that remain for that node into a fresh
//     buffer.  Nodes for which
//     (word length - max_strip_) >= limit are skipped and keep the flag
//     string assigned in step 1.
WordAff * AffixMgr::expand(ParmString word, ParmString aff,
                           ObjStack & buf, int limit) const
{
  // shared empty flag string
  byte * empty = (byte *)buf.alloc(1);
  *empty = 0;

  // all_sfx:   every suffix flag present in aff
  // cross_sfx: those suffix flags that may be combined with a prefix
  byte * all_sfx       = (byte *)buf.alloc(aff.size() + 1);
  byte * all_sfx_end   = all_sfx;
  byte * cross_sfx     = (byte *)buf.alloc(aff.size() + 1);
  byte * cross_sfx_end = cross_sfx;

  WordAff * head = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
  head->word = buf.dup(word);
  head->aff  = all_sfx;
  WordAff * tail = head;

  const byte * flag = (const byte *)aff.str();
  const byte * const flag_end = flag + aff.size();
  for (; flag != flag_end; ++flag)
  {
    SfxEntry * sfx = sFlag[*flag];
    if (sfx) {
      *all_sfx_end++ = *flag;
      if (sfx->allow_cross()) *cross_sfx_end++ = *flag;
    }

    for (PfxEntry * pfx = pFlag[*flag]; pfx; pfx = pfx->flag_next) {
      SimpleString prefixed = pfx->add(word, buf);
      if (!prefixed) continue;
      WordAff * node = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
      node->word = prefixed;
      node->aff  = pfx->allow_cross() ? cross_sfx : empty;
      tail->next = node;
      tail = node;
    }
  }

  *all_sfx_end   = 0;
  *cross_sfx_end = 0;
  tail->next = 0;

  if (limit == 0) return head;

  // `stop` marks the end of the prefix-expanded part of the list; the
  // suffixed forms are appended at `append_at`, which expand_suffix
  // advances, so they are never themselves expanded again.
  WordAff * * const stop = &tail->next;
  WordAff * * append_at  = stop;
  const size_t remaining_size = all_sfx_end - all_sfx + 1;

  for (WordAff * * link = &head; link != stop; link = &(*link)->next) {
    WordAff * entry = *link;
    if ((int)entry->word.size - max_strip_ >= limit) continue;
    byte * remaining = (byte *)buf.alloc(remaining_size);
    expand_suffix(entry->word, entry->aff, buf, limit,
                  remaining, &append_at, word);
    entry->aff = remaining;
  }

  return head;
}
923
924
// Apply the suffix flags in the null terminated string `aff` to `word`.
//
//   word      - the word the suffix is attached to
//   aff       - suffix flags to try
//   buf       - storage for the new words and list nodes
//   limit     - a flag is only tried if
//               (word length - max_strip_f[flag]) < limit
//   new_aff   - if non-null, receives (null terminated) the flags that
//               still need to be kept with `word`: those that produced no
//               expansion at all and those for which at least one entry
//               matched but was held back (SfxEntry::add returned EMPTY)
//   l         - if non-null, *l is the link at which new nodes are
//               appended; it is advanced to the new end of the list
//   orig_word - the word the suffix conditions are tested against;
//               defaults to `word` when not given
//
// When `l` is null the new nodes form their own list and its head is
// returned.  When `l` is given the return value is whatever **l held on
// entry.
WordAff * AffixMgr::expand_suffix(ParmString word, const byte * aff,
                                  ObjStack & buf, int limit,
                                  byte * new_aff, WordAff * * * l,
                                  ParmString orig_word) const
{
  WordAff * head = 0;
  if (l) head = **l;
  WordAff * * cur = l ? *l : &head;
  if (!orig_word) orig_word = word;

  for (; *aff; ++aff) {
    // These are tracked per flag.  They used to be shared by all flags,
    // so the outcome of an earlier flag leaked into the decision for the
    // later ones: a flag skipped because of `limit` was dropped from
    // new_aff once any earlier flag had expanded, and a fully expanded
    // flag was kept once any earlier flag had been held back.
    bool expanded     = false;
    bool not_expanded = false;

    if ((int)word.size() - max_strip_f[*aff] < limit) {
      for (SfxEntry * p = sFlag[*aff]; p; p = p->flag_next) {
        SimpleString newword = p->add(word, buf, limit, orig_word);
        if (!newword) continue;                       // entry does not apply
        if (newword == EMPTY) {not_expanded = true; continue;}
        *cur = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
        (*cur)->word = newword;
        (*cur)->aff  = (const byte *)EMPTY;
        cur = &(*cur)->next;
        expanded = true;
      }
    }

    if (new_aff && (!expanded || not_expanded)) *new_aff++ = *aff;
  }

  *cur = 0;
  if (new_aff) *new_aff = 0;
  if (l) *l = cur;
  return head;
}
957
958
// Classify the affix flag `aff` with respect to `word`:
//
//   ValidAffix        - at least one prefix or suffix entry registered
//                       under the flag is applicable to the word
//   InapplicableAffix - entries exist for the flag but none is applicable
//   InvalidAffix      - no prefix or suffix entry exists for the flag
CheckAffixRes AffixMgr::check_affix(ParmString word, char aff) const
{
  const unsigned char flag = (unsigned char)aff;
  bool flag_known = false;

  for (PfxEntry * pfx = pFlag[flag]; pfx; pfx = pfx->flag_next) {
    flag_known = true;
    if (pfx->applicable(word)) return ValidAffix;
  }

  for (SfxEntry * sfx = sFlag[flag]; sfx; sfx = sfx->flag_next) {
    flag_known = true;
    if (sfx->applicable(word)) return ValidAffix;
  }

  return flag_known ? InapplicableAffix : InvalidAffix;
}
974
975
976
977
//////////////////////////////////////////////////////////////////////
978
//
979
// LookupInfo
980
//
981
982
int LookupInfo::lookup (ParmString word, const SensitiveCompare * c, 
983
                        char achar, 
984
                        WordEntry & o, GuessInfo * gi) const
985
258k
{
986
258k
  SpellerImpl::WS::const_iterator i = begin;
987
258k
  const char * g = 0;
988
258k
  if (mode == Word) {
989
105k
    do {
990
105k
      (*i)->lookup(word, c, o);
991
109k
      for (;!o.at_end(); o.adv()) {
992
3.71k
        if (TESTAFF(o.aff, achar))
993
60
          return 1;
994
3.65k
        else
995
3.65k
          g = o.word;
996
3.71k
      }
997
105k
      ++i;
998
105k
    } while (i != end);
999
208k
  } else if (mode == Clean) {
1000
208k
    do {
1001
208k
      (*i)->clean_lookup(word, o);
1002
243k
      for (;!o.at_end(); o.adv()) {
1003
36.0k
        if (TESTAFF(o.aff, achar))
1004
822
          return 1;
1005
35.2k
        else
1006
35.2k
          g = o.word;
1007
36.0k
      }
1008
207k
      ++i;
1009
207k
    } while (i != end);
1010
208k
  } else if (gi) {
1011
372
    g = gi->dup(word);
1012
372
  }
1013
257k
  if (gi && g) {
1014
1.91k
    CheckInfo * ci = gi->add();
1015
1.91k
    ci->word = g;
1016
1.91k
    return -1;
1017
1.91k
  }
1018
256k
  return 0;
1019
257k
}
1020
1021
//////////////////////////////////////////////////////////////////////
1022
//
1023
// Affix Entry
1024
//
1025
1026
// Return true if this prefix entry may be applied to `word`: the word must
// be longer than the strip string, have at least as many characters as
// there are conditions, and its leading characters must satisfy every
// condition (bit `n` of the condition table entry of a character says
// whether that character is allowed at position `n`).
bool PfxEntry::applicable(SimpleString word) const
{
  if (word.size <= stripl || word.size < conds->num) return false;

  /* make sure all conditions match */
  const byte * cp = (const byte *) word.str;
  for (unsigned int cond = 0;  cond < conds->num;  ++cond, ++cp) {
    if ((conds->get(*cp) & (1 << cond)) == 0)
      return false;
  }
  return true;
}
1040
1041
// add prefix to this word assuming conditions hold
1042
SimpleString PfxEntry::add(SimpleString word, ObjStack & buf) const
1043
140k
{
1044
140k
  unsigned int cond;
1045
  /* make sure all conditions match */
1046
140k
  if ((word.size > stripl) && (word.size >= conds->num)) {
1047
140k
    const byte * cp = (const byte *) word.str;
1048
172k
    for (cond = 0;  cond < conds->num;  cond++) {
1049
112k
      if ((conds->get(*cp++) & (1 << cond)) == 0)
1050
80.3k
        break;
1051
112k
    }
1052
140k
    if (cond >= conds->num) {
1053
      /* */
1054
60.5k
      int alen = word.size - stripl;
1055
60.5k
      char * newword = (char *)buf.alloc(alen + appndl + 1);
1056
60.5k
      if (appndl) memcpy(newword, appnd, appndl);
1057
60.5k
      memcpy(newword + appndl, word + stripl, alen + 1);
1058
60.5k
      return SimpleString(newword, alen + appndl);
1059
60.5k
    }
1060
140k
  }
1061
80.3k
  return SimpleString();
1062
140k
}
1063
1064
// check if this prefix entry matches 
1065
bool PfxEntry::check(const LookupInfo & linf, const AffixMgr * pmyMgr,
1066
                     ParmString word,
1067
                     CheckInfo & ci, GuessInfo * gi, bool cross) const
1068
79.2k
{
1069
79.2k
  unsigned int    cond; // condition number being examined
1070
79.2k
  unsigned              tmpl;   // length of tmpword
1071
79.2k
  WordEntry             wordinfo;     // hash entry of root word or NULL
1072
79.2k
  byte *  cp;   
1073
79.2k
  VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1);
1074
1075
  // on entry prefix is 0 length or already matches the beginning of the word.
1076
  // So if the remaining root word has positive length
1077
  // and if there are enough chars in root word and added back strip chars
1078
  // to meet the number of characters conditions, then test it
1079
1080
79.2k
  tmpl = word.size() - appndl;
1081
1082
79.2k
  if ((tmpl > 0) &&  (tmpl + stripl >= conds->num)) {
1083
1084
    // generate new root word by removing prefix and adding
1085
    // back any characters that would have been stripped
1086
1087
77.4k
    if (stripl) strcpy (tmpword, strip);
1088
77.4k
    strcpy ((tmpword + stripl), (word + appndl));
1089
1090
    // now make sure all of the conditions on characters
1091
    // are met.  Please see the appendix at the end of
1092
    // this file for more info on exactly what is being
1093
    // tested
1094
1095
77.4k
    cp = (byte *)tmpword;
1096
77.5k
    for (cond = 0;  cond < conds->num;  cond++) {
1097
2.01k
      if ((conds->get(*cp++) & (1 << cond)) == 0) break;
1098
2.01k
    }
1099
1100
    // if all conditions are met then check if resulting
1101
    // root word in the dictionary
1102
1103
77.4k
    if (cond >= conds->num) {
1104
75.5k
      CheckInfo * lci = 0;
1105
75.5k
      CheckInfo * guess = 0;
1106
75.5k
      tmpl += stripl;
1107
1108
75.5k
      int res = linf.lookup(tmpword, &linf.sp->s_cmp_end, achar, wordinfo, gi);
1109
1110
75.5k
      if (res == 1) {
1111
1112
136
        lci = &ci;
1113
136
        lci->word = wordinfo.word;
1114
136
        goto quit;
1115
        
1116
75.3k
      } else if (res == -1) {
1117
1118
843
        guess = gi->head;
1119
1120
843
      }
1121
      
1122
      // prefix matched but no root word was found 
1123
      // if XPRODUCT is allowed, try again but now 
1124
      // cross checked combined with a suffix
1125
      
1126
75.3k
      if (gi)
1127
6.17k
        lci = gi->head;
1128
      
1129
75.3k
      if (cross && xpflg & XPRODUCT) {
1130
75.3k
        if (pmyMgr->suffix_check(linf, ParmString(tmpword, tmpl), 
1131
75.3k
                                 ci, gi,
1132
75.3k
                                 XPRODUCT, (AffEntry *)this)) {
1133
0
          lci = &ci;
1134
          
1135
75.3k
        } else if (gi) {
1136
          
1137
6.17k
          CheckInfo * stop = lci;
1138
6.17k
          for (lci = gi->head; 
1139
6.26k
               lci != stop; 
1140
6.17k
               lci = const_cast<CheckInfo *>(lci->next)) 
1141
88
          {
1142
88
            lci->pre_flag = achar;
1143
88
            lci->pre_strip_len = stripl;
1144
88
            lci->pre_add_len = appndl;
1145
88
            lci->pre_add = appnd;
1146
88
          }
1147
          
1148
69.1k
        } else {
1149
          
1150
69.1k
          lci = 0;
1151
          
1152
69.1k
        }
1153
75.3k
      }
1154
    
1155
75.3k
      if (guess)
1156
843
        lci = guess;
1157
      
1158
75.5k
    quit:
1159
75.5k
      if (lci) {
1160
2.95k
        lci->pre_flag = achar;
1161
2.95k
        lci->pre_strip_len = stripl;
1162
2.95k
        lci->pre_add_len = appndl;
1163
2.95k
        lci->pre_add = appnd;
1164
2.95k
      }
1165
75.5k
      if (lci == &ci) return true;
1166
75.5k
    }
1167
77.4k
  }
1168
79.0k
  return false;
1169
79.2k
}
1170
1171
// Return true if this suffix entry may be applied to `word`: the word must
// be longer than the strip string, have at least as many characters as
// there are conditions, and its trailing characters must satisfy every
// condition.  The conditions are tested from the end of the word
// backwards, the last character against the highest numbered condition.
bool SfxEntry::applicable(SimpleString word) const
{
  if (word.size <= stripl || word.size < conds->num) return false;

  /* make sure all conditions match */
  const byte * cp = (const byte *) (word.str + word.size);
  for (int cond = conds->num; --cond >= 0; ) {
    --cp;
    if ((conds->get(*cp) & (1 << cond)) == 0)
      return false;
  }
  return true;
}
1185
1186
// add suffix to this word assuming conditions hold
1187
SimpleString SfxEntry::add(SimpleString word, ObjStack & buf, 
1188
                           int limit, SimpleString orig_word) const
1189
10.8M
{
1190
10.8M
  int cond;
1191
  /* make sure all conditions match */
1192
10.8M
  if ((orig_word.size > stripl) && (orig_word.size >= conds->num)) {
1193
8.35M
    const byte * cp = (const byte *) (orig_word + orig_word.size);
1194
15.8M
    for (cond = conds->num; --cond >=0; ) {
1195
13.8M
      if ((conds->get(*--cp) & (1 << cond)) == 0)
1196
6.39M
        break;
1197
13.8M
    }
1198
8.35M
    if (cond < 0) {
1199
1.96M
      int alen = word.size - stripl;
1200
1.96M
      if (alen >= limit) return EMPTY;
1201
      /* we have a match so add suffix */
1202
1.96M
      char * newword = (char *)buf.alloc(alen + appndl + 1);
1203
1.96M
      memcpy(newword, word, alen);
1204
1.96M
      memcpy(newword + alen, appnd, appndl + 1);
1205
1.96M
      return SimpleString(newword, alen + appndl);
1206
1.96M
    }
1207
8.35M
  }
1208
8.88M
  return SimpleString();
1209
10.8M
}
1210
1211
// see if this suffix is present in the word 
1212
bool SfxEntry::check(const LookupInfo & linf, ParmString word,
1213
                     CheckInfo & ci, GuessInfo * gi,
1214
                     int optflags, AffEntry* ppfx)
1215
21.0M
{
1216
21.0M
  unsigned              tmpl;    // length of tmpword 
1217
21.0M
  int     cond;    // condition beng examined
1218
21.0M
  WordEntry             wordinfo;        // hash entry pointer
1219
21.0M
  byte *  cp;
1220
21.0M
  VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1);
1221
21.0M
  PfxEntry* ep = (PfxEntry *) ppfx;
1222
1223
  // if this suffix is being cross checked with a prefix
1224
  // but it does not support cross products skip it
1225
1226
21.0M
  if ((optflags & XPRODUCT) != 0 &&  (xpflg & XPRODUCT) == 0)
1227
98
    return false;
1228
1229
  // upon entry suffix is 0 length or already matches the end of the word.
1230
  // So if the remaining root word has positive length
1231
  // and if there are enough chars in root word and added back strip chars
1232
  // to meet the number of characters conditions, then test it
1233
1234
21.0M
  tmpl = word.size() - appndl;
1235
1236
21.0M
  if ((tmpl > 0)  &&  (tmpl + stripl >= conds->num)) {
1237
1238
    // generate new root word by removing suffix and adding
1239
    // back any characters that would have been stripped or
1240
    // or null terminating the shorter string
1241
1242
10.8M
    strcpy (tmpword, word);
1243
10.8M
    cp = (byte *)(tmpword + tmpl);
1244
10.8M
    if (stripl) {
1245
10.6M
      strcpy ((char *)cp, strip);
1246
10.6M
      tmpl += stripl;
1247
10.6M
      cp = (byte *)(tmpword + tmpl);
1248
10.6M
    } else *cp = '\0';
1249
1250
    // now make sure all of the conditions on characters
1251
    // are met.  Please see the appendix at the end of
1252
    // this file for more info on exactly what is being
1253
    // tested
1254
1255
23.4M
    for (cond = conds->num;  --cond >= 0; ) {
1256
23.2M
      if ((conds->get(*--cp) & (1 << cond)) == 0) break;
1257
23.2M
    }
1258
1259
    // if all conditions are met then check if resulting
1260
    // root word in the dictionary
1261
1262
10.8M
    if (cond < 0) {
1263
183k
      CheckInfo * lci = 0;
1264
183k
      tmpl += stripl;
1265
183k
      const SensitiveCompare * cmp = 
1266
183k
        optflags & XPRODUCT ? &linf.sp->s_cmp_middle : &linf.sp->s_cmp_begin;
1267
183k
      int res = linf.lookup(tmpword, cmp, achar, wordinfo, gi);
1268
183k
      if (res == 1
1269
183k
          && ((optflags & XPRODUCT) == 0 || TESTAFF(wordinfo.aff, ep->achar)))
1270
674
      {
1271
674
        lci = &ci;
1272
674
        lci->word = wordinfo.word;
1273
182k
      } else if (res == 1 && gi) {
1274
11
        lci = gi->add();
1275
11
        lci->word = wordinfo.word;
1276
182k
      } else if (res == -1) { // gi must be defined
1277
1.07k
        lci = gi->head;
1278
1.07k
      }
1279
1280
183k
      if (lci) {
1281
1.75k
        lci->suf_flag = achar;
1282
1.75k
        lci->suf_strip_len = stripl;
1283
1.75k
        lci->suf_add_len = appndl;
1284
1.75k
        lci->suf_add = appnd;
1285
1.75k
      }
1286
      
1287
183k
      if (lci == &ci) return true;
1288
183k
    }
1289
10.8M
  }
1290
21.0M
  return false;
1291
21.0M
}
1292
1293
//////////////////////////////////////////////////////////////////////
1294
//
1295
// new_affix_mgr
1296
//
1297
1298
1299
// Create the affix manager for `lang`.
//
// Returns a null manager when `name` is "none".  Otherwise a new AffixMgr
// is set up from the file "<data dir>/<language name>_affix.dat", with
// both parts of the path taken from `lang`; `name` is not used for
// anything else here.  If the setup fails the manager is destroyed and
// the error is returned instead.  On success the caller receives the
// newly allocated manager.
PosibErr<AffixMgr *> new_affix_mgr(ParmString name,
                                   Conv & iconv,
                                   const Language * lang)
{
  if (name == "none")
    return 0;

  String file;
  file += lang->data_dir();
  file += '/';
  file += lang->name();
  file += "_affix.dat";

  AffixMgr * affix = new AffixMgr(lang);
  PosibErrBase pe = affix->setup(file, iconv);
  if (!pe.has_err())
    return affix;

  delete affix;
  return pe;
}
1321
}
1322
1323
/**************************************************************************
1324
1325
Appendix:  Understanding Affix Code
1326
1327
1328
An affix is either a  prefix or a suffix attached to root words to make 
1329
other words.
1330
1331
Basically a Prefix or a Suffix is a set of AffEntry objects
1332
which store information about the prefix or suffix along 
1333
with supporting routines to check if a word has a particular 
1334
prefix or suffix or a combination.
1335
1336
The structure affentry is defined as follows:
1337
1338
struct AffEntry
1339
{
1340
   unsigned char achar;   // char used to represent the affix
1341
   char * strip;          // string to strip before adding affix
1342
   char * appnd;          // the affix string to add
1343
   short  stripl;         // length of the strip string
1344
   short  appndl;         // length of the affix string
1345
   short  numconds;       // the number of conditions that must be met
1346
   short  xpflg;          // flag: XPRODUCT- combine both prefix and suffix 
1347
   char   conds[SETSIZE]; // array which encodes the conditions to be met
1348
};
1349
1350
1351
Here is a suffix borrowed from the en_US.aff file.  This file 
1352
is whitespace delimited.
1353
1354
SFX D Y 4 
1355
SFX D   0     e          d
1356
SFX D   y     ied        [^aeiou]y
1357
SFX D   0     ed         [^ey]
1358
SFX D   0     ed         [aeiou]y
1359
1360
This information can be interpreted as follows:
1361
1362
The first line has 4 fields:
1363
1364
Field
1365
-----
1366
1     SFX - indicates this is a suffix
1367
2     D   - is the name of the character flag which represents this suffix
1368
3     Y   - indicates it can be combined with prefixes (cross product)
1369
4     4   - indicates that sequence of 4 affentry structures are needed to
1370
               properly store the affix information
1371
1372
The remaining lines describe the unique information for the 4 SfxEntry 
1373
objects that make up this affix.  Each line can be interpreted
1374
as follows: (note fields 1 and 2 are as a check against line 1 info)
1375
1376
Field
1377
-----
1378
1     SFX         - indicates this is a suffix
1379
2     D           - is the name of the character flag for this affix
1380
3     y           - the string of chars to strip off before adding affix
1381
                         (a 0 here indicates the NULL string)
1382
4     ied         - the string of affix characters to add
1383
5     [^aeiou]y   - the conditions which must be met before the affix
1384
                    can be applied
1385
1386
Field 5 is interesting.  Since this is a suffix, field 5 tells us that
1387
there are 2 conditions that must be met.  The first condition is that 
1388
the next to the last character in the word must *NOT* be any of the 
1389
following "a", "e", "i", "o" or "u".  The second condition is that
1390
the last character of the word must end in "y".
1391
1392
So how can we encode this information concisely and be able to 
1393
test for both conditions in a fast manner?  The answer is found
1394
by studying the wonderful ispell code of Geoff Kuenning, et al. 
1395
(now available under a normal BSD license).
1396
1397
If we set up a conds array of 256 bytes indexed (0 to 255) and access it
1398
using a character (cast to an unsigned char) of a string, we have 8 bits
1399
of information we can store about that character.  Specifically we
1400
could use each bit to say if that character is allowed in any of the 
1401
last (or first for prefixes) 8 characters of the word.
1402
1403
Basically, each character at one end of the word (up to the number 
1404
of conditions) is used to index into the conds array and the resulting 
1405
value found there says whether that character is valid for a 
1406
specific character position in the word.  
1407
1408
For prefixes, it does this by setting bit 0 if that char is valid 
1409
in the first position, bit 1 if valid in the second position, and so on. 
1410
1411
If a bit is not set, then that char is not valid for that position in the
1412
word.
1413
1414
If working with suffixes bit 0 is used for the character closest 
1415
to the front, bit 1 for the next character towards the end, ..., 
1416
with bit numconds-1 representing the last char at the end of the string. 
1417
1418
Note: since entries in the conds[] are 8 bits, only 8 conditions 
1419
(read that only 8 character positions) can be examined at one
1420
end of a word (the beginning for prefixes and the end for suffixes).
1421
1422
So to make this clearer, let's encode the conds array values for the 
1423
first two affentries for the suffix D described earlier.
1424
1425
1426
  For the first affentry:    
1427
     numconds = 1             (only examine the last character)
1428
1429
     conds['e'] =  (1 << 0)   (the word must end in an E)
1430
     all others are all 0
1431
1432
  For the second affentry:
1433
     numconds = 2             (only examine the last two characters)     
1434
1435
     conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
1436
         where X is all characters *but* a, e, i, o, or u
1437
         
1438
1439
     conds['y'] = (1 << 1)     (the last char must be a y)
1440
     all other bits for all other entries in the conds array are zero
1441
1442
1443
**************************************************************************/