Coverage Report

Created: 2024-09-08 06:23

/src/aspell/modules/speller/default/readonly_ws.cpp
Line
Count
Source (jump to first uncovered line)
1
// This file is part of The New Aspell
2
// Copyright (C) 2000-2001,2011 by Kevin Atkinson under the GNU LGPL
3
// license version 2.0 or 2.1.  You should have received a copy of the
4
// LGPL license along with this library if you did not you can find it
5
// at http://www.gnu.org/.
6
7
// Aspell's main word list is laid out as follows:
8
//
9
// * header
10
// * jump table for editdist 1
11
// * jump table for editdist 2
12
// * data block
13
// * hash table
14
15
// data block laid out as follows:
16
//
17
// Words:
18
//   (<8 bit frequency><8 bit: flags><8 bit: offset to next word>
19
//      <8 bit: word size><word><null>
20
//      [<affix info><null>][<category info><null>])+
21
// Words with soundslike:
22
//   (<8 bit: offset to next item><8 bit: soundslike size><soundslike>
23
//      <words with that soundlike>)+
24
// Flags are mapped as follows:
25
//   bits 0-3: word info
26
//   bit    4: duplicate flag
27
//   bit    5: <unused>
28
//   bit    6: have category info
28
//   bit    7: have affix info
30
31
#include <utility>
32
using std::pair;
33
34
#include <string.h>
35
#include <stdio.h>
36
//#include <errno.h>
37
38
#include "settings.h"
39
40
#include "block_vector.hpp"
41
#include "config.hpp"
42
#include "data.hpp"
43
#include "data_util.hpp"
44
#include "errors.hpp"
45
#include "file_util.hpp"
46
#include "fstream.hpp"
47
#include "language.hpp"
48
#include "stack_ptr.hpp"
49
#include "objstack.hpp"
50
#include "vector.hpp"
51
#include "vector_hash-t.hpp"
52
#include "check_list.hpp"
53
#include "lsort.hpp"
54
55
#include "iostream.hpp"
56
57
#include "gettext.h"
58
59
typedef unsigned int   u32int;
60
static const u32int u32int_max = (u32int)-1;
61
typedef unsigned short u16int;
62
typedef unsigned char byte;
63
64
#ifdef USE_32_BIT_HASH_FUN
65
typedef u32int hash_int_t;
66
#else
67
typedef size_t hash_int_t;
68
#endif
69
70
#ifdef HAVE_MMAP 
71
72
// POSIX headers
73
#include <fcntl.h>
74
#include <unistd.h>
75
#include <sys/mman.h>
76
77
#endif
78
79
#ifndef MAP_FAILED 
80
#define MAP_FAILED (-1)
81
#endif
82
83
#ifdef HAVE_MMAP
84
85
static inline char * mmap_open(unsigned int block_size, 
86
             FStream & f, 
87
             unsigned int offset) 
88
1.44k
{
89
1.44k
  f.flush();
90
1.44k
  int fd = f.file_no();
91
1.44k
  return static_cast<char *>
92
1.44k
    (mmap(NULL, block_size, PROT_READ, MAP_SHARED, fd, offset));
93
1.44k
}
94
95
// Releases a mapping of `size` bytes that starts at `block`.  The
// result of munmap() is not examined.
static inline void mmap_free(char * block, unsigned int size) 
{
  void * const mapping = block;
  munmap(mapping, size);
}
99
100
#else
101
102
static inline char * mmap_open(unsigned int, 
103
             FStream & f, 
104
             unsigned int) 
105
{
106
  return reinterpret_cast<char *>(MAP_FAILED);
107
}
108
109
// Stand-in used when mmap is unavailable.  The matching mmap_open()
// never succeeds, so reaching this function is a logic error and the
// process is aborted.
static inline void mmap_free(char *, unsigned int) 
{
  abort();
}
113
114
#endif
115
116
// Bits of the per-word flags byte (the byte found FLAGS_O bytes in
// front of the word text).  They are constants, so they are declared
// const to rule out accidental modification.
static const byte HAVE_AFFIX_FLAG = 1 << 7;
static const byte HAVE_CATEGORY_FLAG = 1 << 6;

static const byte DUPLICATE_FLAG = 1 << 4;
// This flag is set when there is more than one word for a
// particular "clean" word such as "jello" "Jello".  It is set on all
// but the last word of the group.  I.e., if it is set, then the next
// word when converted to its "clean" form equals the same value.

// Low four bits of the flags byte: the word info value.
static const byte WORD_INFO_MASK = 0x0F;

// Distance, in bytes, from the first character of a word back to the
// corresponding field of its record header.
static const int FREQUENCY_INFO_O = 4;
static const int FLAGS_O = 3;
static const int NEXT_O = 2;
static const int WORD_SIZE_O = 1;
131
132
33.1M
static inline int get_word_size(const char * d) {
133
33.1M
  return *reinterpret_cast<const byte *>(d - WORD_SIZE_O);
134
33.1M
}
135
136
21.7M
// Returns the flags byte of the word whose text starts at `d`; it is
// stored FLAGS_O bytes in front of the text.
static inline byte get_flags(const char * d) {
  const byte * const flags_field =
    reinterpret_cast<const byte *>(d - FLAGS_O);
  return *flags_field;
}
139
140
26.3M
// Returns the "offset to next" byte of the record whose text starts
// at `d`; it is stored NEXT_O bytes in front of the text.  Callers in
// this file treat a value of zero as an end marker.
static inline byte get_offset(const char * d) {
  const byte * const next_field =
    reinterpret_cast<const byte *>(d - NEXT_O);
  return *next_field;
}
143
144
35.5M
static inline const char * get_next(const char * d) {
145
35.5M
  return d + *reinterpret_cast<const byte *>(d - NEXT_O);
146
35.5M
}
147
148
672k
static inline const char * get_sl_words_begin(const char * d) {
149
672k
  return d + *reinterpret_cast<const byte *>(d - WORD_SIZE_O) + 4;
150
  // FIXME: This isn't right when frequency info is stored in the table
151
672k
}
152
153
// get_next might go past the end so don't JUST compare
154
// for equality.  Ie use while (cur < end) not (cur != end)
155
672k
static inline const char * get_sl_words_end(const char * d) {
156
672k
  return get_next(d) - 3;
157
672k
}
158
159
10.7M
static inline const char * get_affix(const char * d) {
160
10.7M
  int word_size = get_word_size(d);
161
10.7M
  if (get_flags(d) & HAVE_AFFIX_FLAG) 
162
1.50M
    return d + word_size +  1;
163
9.19M
  else
164
9.19M
    return d + word_size;
165
10.7M
}
166
167
0
static inline const char * get_category(const char * d) {
168
0
  int word_size = get_word_size(d);
169
0
  if (get_flags(d) & (HAVE_AFFIX_FLAG | HAVE_CATEGORY_FLAG)) 
170
0
    return d + strlen(d + word_size +  1) + 1;
171
0
  else if (get_flags(d) & HAVE_CATEGORY_FLAG)
172
0
    return d + word_size +  1;
173
0
  else
174
0
    return d + word_size;
175
0
}
176
177
317k
static inline bool duplicate_flag(const char * d) {
178
317k
  return get_flags(d) & DUPLICATE_FLAG;
179
317k
}
180
181
namespace {
182
183
  using namespace aspeller;
184
185
  /////////////////////////////////////////////////////////////////////
186
  // 
187
  //  ReadOnlyDict
188
  //
189
    
190
  struct Jump
191
  {
192
    char   sl[4];
193
    u32int loc;
194
0
    Jump() {memset(this, 0, sizeof(Jump));}
195
  };
196
  
197
  class ReadOnlyDict : public Dictionary
198
  {
199
200
  public: //but don't use
201
202
    struct WordLookupParms {
203
      const char * block_begin;
204
1.44k
      WordLookupParms() {}
205
      typedef BlockVector<const u32int> Vector;
206
      typedef u32int                    Value;
207
      typedef const char *              Key;
208
      static const bool is_multi = false;
209
29.5M
      Key key(Value v) const {return block_begin + v;}
210
      InsensitiveHash<hash_int_t> hash;
211
      InsensitiveEqual equal;
212
44.5M
      bool is_nonexistent(Value v) const {return v == u32int_max;}
213
0
      void make_nonexistent(const Value & v) const {abort();}
214
    };
215
    typedef VectorHashTable<WordLookupParms> WordLookup;
216
217
  public: // but don't use
218
      
219
    char *           block;
220
    u32int           block_size;
221
    char *           mmaped_block;
222
    u32int           mmaped_size;
223
    const Jump * jump1;
224
    const Jump * jump2;
225
    WordLookup       word_lookup;
226
    const char *     word_block;
227
    const char *     first_word;
228
    
229
    ReadOnlyDict(const ReadOnlyDict&);
230
    ReadOnlyDict& operator= (const ReadOnlyDict&);
231
232
    struct Elements;
233
    struct SoundslikeElements;
234
235
  public:
236
    WordEntryEnumeration * detailed_elements() const;
237
    Size      size()     const;
238
    bool      empty()    const;
239
240
    ReadOnlyDict() 
241
      : Dictionary(basic_dict, "ReadOnlyDict")
242
1.44k
    {
243
1.44k
      block = 0;
244
1.44k
    }
245
246
1.44k
    ~ReadOnlyDict() {
247
1.44k
      if (block != 0) {
248
1.44k
  if (mmaped_block)
249
1.44k
    mmap_free(mmaped_block, mmaped_size);
250
0
  else
251
0
    free(block);
252
1.44k
      }
253
1.44k
    }
254
    
255
    PosibErr<void> load(ParmString, Config &, DictList *, SpellerImpl *);
256
    PosibErr<void> check_hash_fun() const;
257
    void low_level_dump() const;
258
259
    bool lookup(ParmString word, const SensitiveCompare *, WordEntry &) const;
260
261
    bool clean_lookup(ParmString, WordEntry &) const;
262
263
    bool soundslike_lookup(const WordEntry &, WordEntry &) const;
264
    bool soundslike_lookup(ParmString, WordEntry &) const;
265
    
266
    SoundslikeEnumeration * soundslike_elements() const;
267
268
  };
269
270
10.7M
  static inline void convert(const char * w, WordEntry & o) {
271
10.7M
    o.what = WordEntry::Word;
272
10.7M
    o.word = w;
273
10.7M
    o.aff  = get_affix(w);
274
10.7M
    o.word_size = get_word_size(w);
275
10.7M
    o.word_info = get_flags(w) & WORD_INFO_MASK;
276
10.7M
  }
277
    
278
  //
279
  //  
280
  //
281
282
  // Enumerates every word record in storage order, starting from the
  // record whose text pointer is handed to the constructor.
  struct ReadOnlyDict::Elements : public WordEntryEnumeration 
  {
    const char * w;   // text pointer of the next record to report
    WordEntry wi;     // reused result object
    Elements(const char * w0) : w(w0) {wi.what = WordEntry::Word;}

    // Returns the next word, or 0 once the end marker is reached.
    WordEntry * next() {
      // A zero offset is skipped once; a second zero ends the list.
      if (get_offset(w) == 0) w += 2; // FIXME: This needs to be 3
                                      //        when freq info is used
      if (get_offset(w) == 0) return 0;
      const char * const current = w;
      w = get_next(current);
      convert(current, wi);
      return &wi;
    }

    bool at_end() const {return get_offset(w) == 0;}

    WordEntryEnumeration * clone() const {return new Elements(*this);}

    void assign (const WordEntryEnumeration * other) {
      const Elements * const src = static_cast<const Elements *>(other);
      *this = *src;
    }
  };
300
301
0
  // Creates a new enumeration over all words, positioned at the first
  // word of the word block.  The object is allocated with new and
  // returned to the caller.
  WordEntryEnumeration * ReadOnlyDict::detailed_elements() const {
    Elements * const els = new Elements(first_word);
    return els;
  }
304
305
0
  void ReadOnlyDict::low_level_dump() const {
306
0
    bool next_dup = false;
307
0
    const char * w = first_word;
308
0
    for (;;) {
309
0
      if (get_offset(w) == 0) w += 2; // FIXME: This needs to be 3
310
0
                                      //        when freq info is used
311
0
      if (get_offset(w) == 0) break;
312
0
      
313
0
      const char * aff = get_affix(w);
314
0
      byte flags = get_flags(w);
315
0
      byte word_info = flags & WORD_INFO_MASK;
316
0
      byte offset = get_offset(w);
317
0
      int size = get_word_size(w);
318
0
      if (next_dup) printf("\\");
319
0
      printf("%s", w);
320
0
      if (flags & HAVE_AFFIX_FLAG) printf("/%s", aff);
321
0
      if (word_info) printf(" [WI: %d]", word_info);
322
0
      //if (flags & DUPLICATE_FLAG) printf(" [NEXT DUP]");
323
0
      const char * p = w;
324
0
      WordLookup::const_iterator i = word_lookup.find(w);
325
0
      if (!next_dup) {
326
0
        if (i == word_lookup.end())
327
0
          printf(" <BAD HASH>");
328
0
        else if (word_block + *i != w) {
329
0
          printf(" <BAD HASH, got %s>", word_block + *i);
330
0
        }
331
0
        else 
332
0
          printf(" <hash ok>");
333
0
      }
334
0
      printf("\n");
335
0
      String buf;
336
0
      if (flags & DUPLICATE_FLAG) next_dup = true;
337
0
      else next_dup = false;
338
0
      w = get_next(w);
339
0
    }
340
0
  }
341
  
342
1.44k
  // Sanity-checks that the hash function in use matches the one the
  // file was built with.  The first word that has at least 12
  // characters for which to_clean() is non-zero is looked up; if the
  // table does not lead back to that exact record a bad_file_format
  // error is returned.  If no such word exists the check passes.
  PosibErr<void> ReadOnlyDict::check_hash_fun() const {
    const char * w = first_word;
    for (;;) {
      if (get_offset(w) == 0) w += 2; // FIXME: This needs to be 3
                                      //        when freq info is used
      if (get_offset(w) == 0) break;

      if (get_word_size(w) >= 12) {
        // count clean characters, stopping at the end of the word or
        // as soon as 12 have been seen
        int clean_size = 0;
        for (const char * p = w; *p && clean_size < 12; ++p) {
          if (lang()->to_clean(*p)) ++clean_size;
        }
        if (clean_size >= 12) {
          WordLookup::const_iterator i = word_lookup.find(w);
          if (i != word_lookup.end() && word_block + *i == w)
            return no_err;
          return make_err(bad_file_format, file_name(), 
                          _("Incompatible hash function."));
        }
      }

      // step over the whole group of words sharing this clean form
      while (get_flags(w) & DUPLICATE_FLAG)
        w = get_next(w);
      w = get_next(w);
    }
    return no_err;
  }
373
374
3.54k
  // Number of entries reported by the word hash table.
  ReadOnlyDict::Size ReadOnlyDict::size() const {
    const Size count = word_lookup.size();
    return count;
  }
377
  
378
0
  bool ReadOnlyDict::empty() const {
379
0
    return word_lookup.empty();
380
0
  }
381
382
  static const char * const cur_check_word = "aspell default speller rowl 1.10";
383
384
  // Fixed-size header stored at the very start of a compiled word-list
  // file.  It is read from and written to disk as raw bytes, so the
  // order and types of the fields define the file format and must not
  // be changed.
  struct DataHead {
385
    // all sizes except the last four must be divisible by:
386
    static const unsigned int align = 16;
387
    // format identifier; the loader compares it with cur_check_word
    char check_word[64];
388
    u32int endian_check; // = 12345678
389
    char lang_hash[16];
390
391
    // size of this header plus the name strings that follow it,
    // rounded up to `align`; the data block starts at this file offset
    u32int head_size;
392
    u32int block_size;
393
    // the next four offsets are relative to the start of the data block
    u32int jump1_offset;
394
    u32int jump2_offset; // zero means there are no jump tables
395
    u32int word_offset;
396
    u32int hash_offset;
397
398
    u32int word_count;
399
    u32int word_buckets;
400
    u32int soundslike_count;
401
402
    // byte sizes of the strings stored directly after this struct, in
    // this order; the writer counts the terminating NUL in each size
    u32int dict_name_size;
403
    u32int lang_name_size;
404
    u32int soundslike_name_size;
405
    u32int soundslike_version_size;
406
407
    u32int first_word_offset; // from word block
408
409
    byte affix_info; // 0 = none, 1 = partially expanded, 2 = full
410
    byte invisible_soundslike;
411
    byte soundslike_root_only;
412
    byte compound_info; //
413
    byte freq_info;
414
  };
415
416
  PosibErr<void> ReadOnlyDict::load(ParmString f0, Config & config, 
417
                                    DictList *, SpellerImpl *)
418
1.44k
  {
419
1.44k
    set_file_name(f0);
420
1.44k
    const char * fn = file_name();
421
422
1.44k
    FStream f;
423
1.44k
    RET_ON_ERR(f.open(fn, "rb"));
424
425
1.44k
    DataHead data_head;
426
427
1.44k
    f.read(&data_head, sizeof(DataHead));
428
429
#if 0
430
    COUT << "Head Size: " << data_head.head_size << "\n";
431
    COUT << "Data Block Size: " << data_head.data_block_size << "\n";
432
    COUT << "Hash Block Size: " << data_head.hash_block_size << "\n";
433
    COUT << "Total Block Size: " << data_head.total_block_size << "\n";
434
#endif
435
436
1.44k
    if (strcmp(data_head.check_word, cur_check_word) != 0)
437
0
      return make_err(bad_file_format, fn);
438
439
1.44k
    if (data_head.endian_check != 12345678)
440
0
      return make_err(bad_file_format, fn, _("Wrong endian order."));
441
442
1.44k
    CharVector word;
443
444
1.44k
    word.resize(data_head.dict_name_size);
445
1.44k
    f.read(word.data(), data_head.dict_name_size);
446
447
1.44k
    word.resize(data_head.lang_name_size);
448
1.44k
    f.read(word.data(), data_head.lang_name_size);
449
450
1.44k
    PosibErr<void> pe = set_check_lang(word.data(),config);
451
1.44k
    if (pe.has_err()) {
452
4
      if (pe.prvw_err()->is_a(language_related_error))
453
0
        return pe.with_file(fn);
454
4
      else
455
4
        return pe;
456
4
    }
457
458
1.44k
    if (data_head.soundslike_name_size != 0) {
459
1.44k
      word.resize(data_head.soundslike_name_size);
460
1.44k
      f.read(word.data(), data_head.soundslike_name_size);
461
462
1.44k
      if (strcmp(word.data(), lang()->soundslike_name()) != 0)
463
0
        return make_err(bad_file_format, fn, _("Wrong soundslike."));
464
465
1.44k
      word.resize(data_head.soundslike_version_size);
466
1.44k
      f.read(word.data(), data_head.soundslike_version_size);
467
468
1.44k
      if (strcmp(word.data(), lang()->soundslike_version()) != 0)
469
0
        return make_err(bad_file_format, fn, _("Wrong soundslike version."));
470
1.44k
    }
471
472
1.44k
    invisible_soundslike = data_head.invisible_soundslike;
473
1.44k
    soundslike_root_only = data_head.soundslike_root_only;
474
475
1.44k
    affix_compressed = data_head.affix_info;
476
477
1.44k
    block_size = data_head.block_size;
478
1.44k
    int offset = data_head.head_size;
479
1.44k
    mmaped_block = mmap_open(block_size + offset, f, 0);
480
1.44k
    if( mmaped_block != (char *)MAP_FAILED) {
481
1.44k
      block = mmaped_block + offset;
482
1.44k
      mmaped_size = block_size + offset;
483
1.44k
    } else {
484
0
      mmaped_block = 0;
485
0
      block = (char *)malloc(block_size);
486
0
      f.seek(data_head.head_size);
487
0
      f.read(block, block_size);
488
0
    }
489
490
1.44k
    if (data_head.jump2_offset) {
491
1.44k
      fast_scan = true;
492
1.44k
      jump1 = reinterpret_cast<const Jump *>(block + data_head.jump1_offset);
493
1.44k
      jump2 = reinterpret_cast<const Jump *>(block + data_head.jump2_offset);
494
1.44k
    } else {
495
0
      jump1 = jump2 = 0;
496
0
    }
497
498
1.44k
    word_block = block + data_head.word_offset;
499
1.44k
    first_word = word_block + data_head.first_word_offset;
500
501
1.44k
    word_lookup.parms().block_begin = word_block;
502
1.44k
    word_lookup.parms().hash .lang     = lang();
503
1.44k
    word_lookup.parms().equal.cmp.lang = lang();
504
1.44k
    const u32int * begin = reinterpret_cast<const u32int *>
505
1.44k
      (block + data_head.hash_offset);
506
1.44k
    word_lookup.vector().set(begin, begin + data_head.word_buckets);
507
1.44k
    word_lookup.set_size(data_head.word_count);
508
    
509
    //low_level_dump();
510
1.44k
    RET_ON_ERR(check_hash_fun());
511
    
512
1.44k
    return no_err;
513
1.44k
  }
514
515
  void lookup_adv(WordEntry * wi);
516
517
  static inline void prep_next(WordEntry * wi, 
518
                               const char * w,
519
                               const SensitiveCompare * c, 
520
                               const char * orig) 
521
17.8k
  {
522
17.9k
  loop:
523
17.9k
    if (!duplicate_flag(w)) return;
524
1.13k
    w = get_next(w);
525
1.13k
    if (!(*c)(orig, w)) goto loop;
526
1.01k
    wi->intr[0] = (void *)w;
527
1.01k
    wi->intr[1] = (void *)c;
528
1.01k
    wi->intr[2] = (void *)orig;
529
1.01k
    wi->adv_ = lookup_adv;
530
1.01k
  }
531
532
  // Advance callback installed by prep_next(): turns `wi` into the
  // match that was queued in wi->intr[] and then queues the next
  // match, if there is one.
  void lookup_adv(WordEntry * wi) 
  {
    const char * const match = (const char *)wi->intr[0];
    const SensitiveCompare * const cmp = (const SensitiveCompare *)wi->intr[1];
    const char * const wanted = (const char *)wi->intr[2];

    convert(match, *wi);
    wi->adv_ = 0;   // stays cleared unless prep_next finds another match
    prep_next(wi, match, cmp, wanted);
  }
541
542
  bool ReadOnlyDict::lookup(ParmString word, const SensitiveCompare * c,
543
                            WordEntry & o) const 
544
689k
  {
545
689k
    o.clear();
546
689k
    WordLookup::const_iterator i = word_lookup.find(word);
547
689k
    if (i == word_lookup.end()) return false;
548
56.8k
    const char * w = word_block + *i;
549
93.7k
    for (;;) {
550
93.7k
      if ((*c)(word, w)) {
551
17.6k
        convert(w,o);
552
17.6k
        prep_next(&o, w, c, word);
553
17.6k
        return true;
554
17.6k
      }
555
76.0k
      if (!duplicate_flag(w)) break;
556
36.9k
      w = get_next(w);
557
36.9k
    }
558
39.1k
    return false;
559
56.8k
  }
560
561
  // Enumeration over the soundslike entries of a ReadOnlyDict.  It
  // walks the two jump tables (levels 1 and 2) and the word block
  // (level 3); see next().
  //
  // `prev` was previously left uninitialised by the constructor; it is
  // now set to 0 like `cur`.
  struct ReadOnlyDict::SoundslikeElements : public SoundslikeEnumeration
  {
    WordEntry data;            // reused result object
    const ReadOnlyDict * obj;
    const Jump * jump1;        // position in the first jump table
    const Jump * jump2;        // position in the second jump table
    const char * cur;          // next item of the word block to look at
    const char * prev;         // item returned by the previous call, or 0
    int level;
    bool invisible_soundslike;

    WordEntry * next(int stopped_at);

    SoundslikeElements(const ReadOnlyDict * o)
      : obj(o), jump1(obj->jump1), jump2(obj->jump2), cur(0), prev(0),
        level(1), invisible_soundslike(o->invisible_soundslike) {
      data.what = o->invisible_soundslike ? WordEntry::Word : WordEntry::Soundslike;}
  };
579
580
23.5M
  // Returns the next entry of the enumeration, or 0 when the jump
  // table entry reached has an empty soundslike.
  //
  // The state is a position in the first jump table (level 1), in the
  // second jump table (level 2) or in the word block (level 3).
  // `stopped_at` comes from the caller and decides whether to descend
  // a level or skip to the next entry of the current level -- it
  // presumably is how far the caller got into the previously returned
  // entry; confirm against the callers.
  //
  // Jump table entries are returned with data.intr[0] == 0, word block
  // entries with data.intr[0] pointing at the item.
  WordEntry * ReadOnlyDict::SoundslikeElements::next(int stopped_at) {

    //CERR << level << ":" << stopped_at << "  :";
    //CERR << jump1->sl << ":" << jump2->sl << "\n";

  loop:

    const char * item = cur;  // entry this call is going to report
    const char * last;        // word block item reported last time, or 0

    if (level == 1 && stopped_at < 2) {

      // skip to the next first-level entry
      ++jump1;
      item = jump1->sl;
      goto jquit;
    
    } else if (level == 2 && stopped_at < 3) {

      // skip to the next second-level entry; a change in the second
      // character means moving back up to the first level
      ++jump2;
      if (jump2[-1].sl[1] != jump2[0].sl[1]) {
        level = 1;
        ++jump1;
        item = jump1->sl;
      } else {
        item = jump2->sl;
      }
      goto jquit;
      
    } else if (level == 1) {

      // descend from the first to the second jump table
      jump2 = obj->jump2 + jump1->loc;
      level = 2;
      item = jump2->sl;
      goto jquit;

    } else if (level == 2) {

      // descend from the second jump table into the word block
      cur = obj->word_block + jump2->loc;
      item = cur;
      level = 3;

    } else if (get_offset(cur) == 0) {

      // end of this run of the word block: back to the jump tables
      level = 2;
      ++jump2;
      if (jump2[-1].sl[1] != jump2[0].sl[1]) {
        level = 1;
        ++jump1;
        item = jump1->sl;
      } else {
        item = jump2->sl;
      }
      goto jquit;

    } 

    cur = get_next(cur); // this will be the NEXT item looked at

    last = prev;
    prev = item;
    if (last) {
      // PRECOND:
      // unless stopped_at >= LARGE_NUM
      //     strlen(last) >= stopped_at
      //     (stopped_at >= 3) implies 
      //         strncmp(last, item, 3) == 0 if !invisible_soundslike
      //         strncmp(to_sl(last), to_sl(item), 3) == 0 if invisible_soundslike
      //
      // If `item` agrees with `last` up to and including position
      // stopped_at - 3 + 3, it is skipped.
      switch (stopped_at) {
      case 3:
        if (last[3] == item[3]) goto loop;
        break;
      case 4:
        if (last[3] == item[3] && item[3] &&
            last[4] == item[4]) goto loop;
        break;
      case 5:
        if (last[3] == item[3] && item[3] &&
            last[4] == item[4] && item[4] &&
            last[5] == item[5]) goto loop;
        break;
      default:
        break;
      }
    }
    
    // report an item of the word block
    data.word = item;
    data.word_size = get_word_size(item);
    if (invisible_soundslike) {
      convert(item, data);
    } 
    data.intr[0] = (void *)item;
    
    return &data;

  jquit:
    // report a jump table entry
    prev = 0;
    if (!*item) return 0;
    data.word = item;
    if (item[1] == '\0')
      data.word_size = 1;
    else if (item[2] == '\0')
      data.word_size = 2;
    else
      data.word_size = 3;
    data.intr[0] = 0;
    if (invisible_soundslike) {
      data.what = WordEntry::Clean;
      data.aff  = 0;
    }
    return &data;
  }
679
680
17.8k
  // Creates a new soundslike enumeration over this dictionary.  The
  // object is allocated with new and returned to the caller.
  SoundslikeEnumeration * ReadOnlyDict::soundslike_elements() const {
    SoundslikeElements * const els = new SoundslikeElements(this);
    return els;
  }
685
    
686
  // Advance callback for the words of one soundslike entry.
  // w->intr[0] is the word to report now and w->intr[1] the end bound.
  // After reporting, intr[0] is moved on to the following word, and
  // the callback is removed once that word is at or past the bound.
  static void soundslike_next(WordEntry * w)
  {
    const char * const word  = (const char *)(w->intr[0]);
    const char * const limit = (const char *)(w->intr[1]);

    convert(word, *w);

    const char * const following = get_next(word);
    w->intr[0] = (void *)following;
    if (following >= limit) w->adv_ = 0;
  }
695
696
  // Advance callback installed by clean_lookup(): moves `wi` on to the
  // next word of the same duplicate group and removes itself when that
  // word is the last of the group.
  static void clean_lookup_adv(WordEntry * wi) 
  {
    const char * const following = get_next(wi->word);
    convert(following,*wi);
    if (!duplicate_flag(following)) wi->adv_ = 0;
  }
703
704
  // Looks `sl` up through the hash table without any case-sensitive
  // filtering.  On a hit `o` describes the first word of the group,
  // the rest of the group is reachable by advancing `o`, and true is
  // returned.  On a miss `o` is left cleared and the result is false.
  bool ReadOnlyDict::clean_lookup(ParmString sl, WordEntry & o) const
  {
    o.clear();
    WordLookup::const_iterator hit = word_lookup.find(sl);
    if (hit == word_lookup.end())
      return false;

    const char * const first = word_block + *hit;
    convert(first, o);
    if (duplicate_flag(first))
      o.adv_ = clean_lookup_adv;
    return true;
  }
714
    
715
  bool ReadOnlyDict::soundslike_lookup(const WordEntry & s, WordEntry & w) const 
716
1.93M
  {
717
1.93M
    if (s.intr[0] == 0) {
718
719
640k
      return false;
720
721
1.29M
    } else if (!invisible_soundslike) {
722
      
723
672k
      w.clear();
724
672k
      w.what = WordEntry::Word;
725
672k
      w.intr[0] = (void *)get_sl_words_begin(s.word);
726
672k
      w.intr[1] = (void *)get_sl_words_end(s.word);
727
672k
      w.adv_ = soundslike_next;
728
672k
      soundslike_next(&w);
729
672k
      return true;
730
      
731
672k
    } else {
732
733
623k
      w.clear();
734
623k
      w.what = WordEntry::Word;
735
623k
      convert(s.word, w);
736
623k
      return true;
737
738
623k
    }
739
1.93M
  }
740
741
  // Lookup by soundslike string.  Only supported when the soundslike
  // is invisible, in which case it is the same as a clean lookup;
  // otherwise false is returned and `w` is not touched.
  bool ReadOnlyDict::soundslike_lookup(ParmString s, WordEntry & w) const 
  {
    if (!invisible_soundslike)
      return false;
    return ReadOnlyDict::clean_lookup(s,w);
  }
749
750
}  
751
752
namespace aspeller {
753
754
1.44k
  // Factory for the read-only dictionary implementation of this file.
  // The object is allocated with new and returned to the caller.
  Dictionary * new_default_readonly_dict() {
    ReadOnlyDict * const dict = new ReadOnlyDict();
    return dict;
  }
757
  
758
}
759
760
namespace {
761
762
  // Possible:
763
  //   No Affix Compression:
764
  //     no soundslike
765
  //     invisible soundslike
766
  //     with soundslike
767
  //   Affix Compression:
768
  //     group by root:
769
  //       no soundslike
770
  //       invisible soundslike
771
  //       with soundslike
772
  //     expand prefix:  
773
  //       no soundslike
774
  //       invisible soundslike
775
776
  using namespace aspeller;
777
778
  // Node of a singly linked list of words (linked through `next`).
  // NOTE(review): `word` is declared with a single element and
  // struct_size excludes it, which suggests the node is allocated with
  // extra room so the word text can extend past the end of the struct
  // -- confirm against the code that allocates these nodes.
  struct WordData {
779
    // sizeof(WordData) minus the one placeholder byte of `word`
    static const unsigned struct_size;
780
    WordData * next;
781
    char * sl;
782
    char * aff;
783
    byte word_size;
784
    byte sl_size;
785
    byte data_size;
786
    byte flags;
787
    char word[1];
788
  };
789
790
  const unsigned WordData::struct_size = sizeof(WordData) - 1;
791
  
792
793
  struct SoundslikeLess {
794
    InsensitiveCompare icomp;
795
0
    SoundslikeLess(const Language * l) : icomp(l) {}
796
0
    bool operator() (WordData * x, WordData * y) const {
797
0
      int res = strcmp(x->sl, y->sl);
798
0
      if (res != 0) return res < 0;
799
0
      res = icomp(x->word, y->word);
800
0
      if (res != 0) return res < 0;
801
0
      return strcmp(x->word, y->word) < 0;
802
0
    }
803
  };
804
805
  struct WordLookupParms {
806
    const char * block_begin;
807
0
    WordLookupParms() {}
808
    typedef acommon::Vector<u32int> Vector;
809
    typedef u32int              Value;
810
    typedef const char *        Key;
811
    static const bool is_multi = false;
812
0
    Key key(Value v) const {return block_begin + v;}
813
    InsensitiveHash<hash_int_t> hash;
814
    InsensitiveEqual equal;
815
0
    bool is_nonexistent(Value v) const {return v == u32int_max;}
816
0
    void make_nonexistent(Value & v) const {v = u32int_max;}
817
  };
818
  typedef VectorHashTable<WordLookupParms> WordLookup;
819
820
0
  // Rounds `i` up to the nearest multiple of `size`.  `size` must not
  // be zero.
  static inline unsigned int round_up(unsigned int i, unsigned int size) {
    const unsigned int units = (i + size - 1) / size;
    return units * size;
  }
823
824
0
  // Pads `out` with NUL bytes until its write position reaches `pos`.
  // `pos` must not lie before the current position; this is asserted.
  //
  // The loop tests `diff > 0` rather than `diff != 0`, so a build
  // with assertions disabled writes nothing for a negative `diff`
  // instead of decrementing until the counter wraps around.
  static void advance_file(FStream & out, int pos) {
    int diff = pos - out.tell();
    assert(diff >= 0);
    for(; diff > 0; --diff)
      out << '\0';
  }
830
831
  PosibErr<void> create (StringEnumeration * els,
832
       const Language & lang,
833
                         Config & config) 
834
0
  {
835
0
    assert(sizeof(u16int) == 2);
836
0
    assert(sizeof(u32int) == 4);
837
838
0
    bool full_soundslike = !(strcmp(lang.soundslike_name(), "none") == 0 ||
839
0
                             strcmp(lang.soundslike_name(), "stripped") == 0 ||
840
0
                             strcmp(lang.soundslike_name(), "simple") == 0);
841
842
0
    bool affix_compress = (lang.affix() && 
843
0
                           config.retrieve_bool("affix-compress"));
844
845
0
    bool partially_expand = (affix_compress &&
846
0
                             !full_soundslike &&
847
0
                             config.retrieve_bool("partially-expand"));
848
849
0
    bool invisible_soundslike = false;
850
0
    if (partially_expand)
851
0
      invisible_soundslike = true;
852
0
    else if (config.have("invisible-soundslike"))
853
0
      invisible_soundslike = config.retrieve_bool("invisible-soundslike");
854
0
    else if (!full_soundslike)
855
0
      invisible_soundslike = true;
856
857
0
    ConvEC iconv;
858
0
    if (!config.have("norm-strict"))
859
0
      config.replace("norm-strict", "true");
860
0
    if (config.have("encoding")) {
861
0
      String enc = config.retrieve("encoding");
862
0
      RET_ON_ERR(iconv.setup(config, enc, lang.charmap(), NormFrom));
863
0
    } else {
864
0
      RET_ON_ERR(iconv.setup(config, lang.data_encoding(), lang.charmap(), NormFrom));
865
0
    }
866
867
0
    String base = config.retrieve("master-path");
868
869
0
    DataHead data_head;
870
0
    memset(&data_head, 0, sizeof(data_head));
871
0
    strcpy(data_head.check_word, cur_check_word);
872
873
0
    data_head.endian_check = 12345678;
874
875
0
    data_head.dict_name_size = 1;
876
0
    data_head.lang_name_size = strlen(lang.name()) + 1;
877
0
    data_head.soundslike_name_size    = strlen(lang.soundslike_name()) + 1;
878
0
    data_head.soundslike_version_size = strlen(lang.soundslike_version()) + 1;
879
0
    data_head.head_size  = sizeof(DataHead);
880
0
    data_head.head_size += data_head.dict_name_size;
881
0
    data_head.head_size += data_head.lang_name_size;
882
0
    data_head.head_size += data_head.soundslike_name_size;
883
0
    data_head.head_size += data_head.soundslike_version_size;
884
0
    data_head.head_size  = round_up(data_head.head_size, DataHead::align);
885
886
0
    data_head.affix_info = affix_compress ? partially_expand ? 1 : 2 : 0;
887
0
    data_head.invisible_soundslike = invisible_soundslike;
888
0
    data_head.soundslike_root_only = affix_compress  && !partially_expand ? 1 : 0;
889
890
#if 0
891
    CERR.printl("FLAGS:  ");
892
    if (full_soundslike) CERR.printl("  full soundslike");
893
    if (invisible_soundslike) CERR.printl("  invisible soundslike");
894
    if (data_head.soundslike_root_only) CERR.printl("  soundslike root only");
895
    if (affix_compress) CERR.printl("  affix compress");
896
    if (partially_expand) CERR.printl("  partially expand");
897
    CERR.printl("---");
898
#endif
899
    
900
0
    String temp;
901
902
0
    int num_entries = 0;
903
0
    int uniq_entries = 0;
904
    
905
0
    ObjStack buf(16*1024);
906
0
    String sl_buf;
907
908
0
    WordData * first = 0;
909
910
    //
911
    // Read in Wordlist
912
    //
913
0
    {
914
0
      WordListIterator wl_itr(els, &lang, config.retrieve_bool("warn") ? &CERR : 0);
915
0
      wl_itr.init(config);
916
0
      ObjStack exp_buf;
917
0
      WordAff * exp_list;
918
0
      WordAff single;
919
0
      single.next = 0;
920
0
      Vector<WordAff> af_list;
921
0
      WordData * * prev = &first;
922
923
0
      for (;;) {
924
925
0
        PosibErr<bool> pe = wl_itr.adv();
926
0
        if (pe.has_err()) return pe;
927
0
        if (!pe.data) break;
928
929
0
        const char * w = wl_itr->word.str;
930
0
        unsigned int s = wl_itr->word.size;
931
932
0
        const char * affixes = wl_itr->aff.str;
933
934
0
        if (*affixes && !lang.affix())
935
0
          return make_err(other_error, 
936
0
                          _("Affix flags found in word but no affix file given."));
937
938
0
        if (*affixes && !affix_compress) {
939
0
          exp_buf.reset();
940
0
          exp_list = lang.affix()->expand(w, affixes, exp_buf);
941
0
        } else if (*affixes && partially_expand) {
942
          // expand any affixes which will affect the first
943
          // 3 letters of a word.  This is needed so that the
944
          // jump tables will function correctly
945
0
          exp_buf.reset();
946
0
          exp_list = lang.affix()->expand(w, affixes, exp_buf, 3);
947
0
        } else {
948
0
          single.word.str = w;
949
0
          single.word.size = strlen(w);
950
0
          single.aff = (const byte *)affixes;
951
0
          exp_list = &single;
952
0
        }
953
954
        // iterate through each expanded word
955
        
956
0
        for (WordAff * p = exp_list; p; p = p->next)
957
0
        {
958
0
          const char * w = p->word.str;
959
0
          s = p->word.size;
960
          
961
0
          unsigned total_size = WordData::struct_size;
962
0
          unsigned data_size = s + 1;
963
0
          unsigned aff_size = strlen((const char *)p->aff);
964
0
          if (aff_size > 0) data_size += aff_size + 1;
965
0
          total_size += data_size;
966
0
          lang.to_soundslike(sl_buf, w);
967
0
          const char * sl = sl_buf.str();
968
0
          unsigned sl_size = sl_buf.size();
969
0
          if (strcmp(sl,w) == 0) sl = w;
970
0
          if (sl != w) total_size += sl_size + 1;
971
972
0
          if (total_size - WordData::struct_size > 240)
973
0
            return make_err(invalid_word, MsgConv(lang)(w),
974
0
                            _("The total word length, with soundslike data, is larger than 240 characters."));
975
976
0
          WordData * b = (WordData *)buf.alloc(total_size, sizeof(void *));
977
0
          *prev = b;
978
0
          b->next = 0;
979
0
          prev = &b->next;
980
          
981
0
          b->word_size = s;
982
0
          b->sl_size = strlen(sl);
983
0
          b->data_size = data_size;
984
0
          b->flags = lang.get_word_info(w);
985
986
0
          char * z = b->word;
987
988
0
          memcpy(z, w, s + 1);
989
0
          z += s + 1;
990
991
0
          if (aff_size > 0) {
992
0
            b->flags |= HAVE_AFFIX_FLAG;
993
0
            b->aff = z;
994
0
            memcpy(z, p->aff, aff_size + 1);
995
0
            z += aff_size + 1;
996
0
          } else {
997
0
            b->aff = 0;
998
0
          }
999
1000
0
          if (sl != w) {
1001
0
            memcpy(z, sl, sl_size + 1);
1002
0
            b->sl = z;
1003
0
          } else {
1004
0
            b->sl = b->word;
1005
0
          }
1006
1007
0
        }
1008
0
      }
1009
0
      delete els;
1010
0
    }
1011
1012
    //
1013
    // sort WordData linked list based on (sl, word)
1014
    //
1015
1016
0
    first = sort(first, SoundslikeLess(&lang));
1017
1018
    //
1019
    // duplicate check
1020
    // 
1021
0
    WordData * prev = first;
1022
0
    WordData * cur = first ? first->next : 0;
1023
0
    InsensitiveEqual ieq(&lang);
1024
0
    while (cur) {
1025
0
      if (strcmp(prev->word, cur->word) == 0) {
1026
        // merge affix info if necessary
1027
0
        if (!prev->aff && cur->aff) {
1028
0
          prev->flags |= HAVE_AFFIX_FLAG;
1029
0
          prev->aff = cur->aff;
1030
0
          prev->data_size += strlen(prev->aff) + 1;
1031
0
        } else if (prev->aff && cur->aff) {
1032
0
          unsigned l1 = strlen(prev->aff);
1033
0
          unsigned l2 = strlen(cur->aff);
1034
0
          char * aff = (char *)buf.alloc(l1 + l2 + 1);
1035
0
          memcpy(aff, prev->aff, l1);
1036
0
          prev->aff = aff;
1037
0
          aff += l1;
1038
0
          for (const char * p = cur->aff; *p; ++p) {
1039
0
            if (memchr(prev->aff, *p, l1)) continue;
1040
0
            *aff = *p;
1041
0
            ++aff;
1042
0
          }
1043
0
          *aff = '\0';
1044
0
          prev->data_size = prev->word_size + (aff - prev->aff) + 2;
1045
0
        }
1046
0
        prev->next = cur->next;
1047
0
      } else {
1048
0
        if (ieq(prev->word, cur->word)) prev->flags |= DUPLICATE_FLAG;
1049
0
        else ++uniq_entries;
1050
0
        ++num_entries;
1051
0
        prev = cur;
1052
0
      }
1053
0
      cur = cur->next;
1054
0
    }
1055
1056
    //
1057
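    // Compute the data block size; without invisible soundslike, also make each soundslike group share one sl pointer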
1058
    //
1059
1060
0
    unsigned data_size = 16;
1061
0
    WordData * p = first;
1062
0
    if (invisible_soundslike) {
1063
      
1064
0
      for (; p; p = p->next)
1065
0
        data_size += 3 + p->data_size;
1066
1067
0
    } else {
1068
1069
0
      while (p)
1070
0
      {
1071
0
        unsigned ds = 2 + p->sl_size + 1;
1072
1073
0
        char * prev = p->sl;
1074
1075
0
        do {
1076
          
1077
0
          ds += 3 + p->data_size;
1078
0
          p->sl = prev;
1079
0
          p = p->next;
1080
1081
0
        } while (p && strcmp(prev, p->sl) == 0 && ds + 3 + p->data_size < 255);
1082
1083
0
        data_size += ds;
1084
1085
0
      }
1086
1087
0
    }
1088
1089
    //
1090
    // Create the final data structures
1091
    //
1092
1093
0
    CharVector     data;
1094
0
    data.reserve(data_size);
1095
0
    data.write32(0); // to avoid nasty special cases
1096
0
    unsigned int prev_pos = data.size();
1097
0
    data.write32(0);
1098
0
    unsigned prev_w_pos = data.size();
1099
1100
0
    WordLookup lookup(affix_compress 
1101
0
                      ? uniq_entries * 3 / 2 
1102
0
                      : uniq_entries * 5 / 4);
1103
0
    lookup.parms().block_begin = data.begin();
1104
0
    lookup.parms().hash .lang     = &lang;
1105
0
    lookup.parms().equal.cmp.lang = &lang;
1106
1107
0
    Vector<Jump> jump1;
1108
0
    Vector<Jump> jump2;
1109
1110
0
    const int head_size = invisible_soundslike ? 3 : 2;
1111
1112
0
    const char * prev_sl = "";
1113
0
    p = first;
1114
0
    while (p)
1115
0
    {
1116
0
      if (invisible_soundslike) {
1117
1118
0
        data.write(p->flags); // flags  
1119
0
        data.write('\0'); // place holder for offset to next item
1120
0
        data.write(p->word_size);
1121
1122
0
      } else {
1123
1124
0
        data.write('\0'); // place holder for offset to next item
1125
0
        data.write(p->sl_size);
1126
1127
0
      }
1128
        
1129
0
      if (strncmp(prev_sl, p->sl, 3) != 0) {
1130
        
1131
0
        Jump jump;
1132
0
        strncpy(jump.sl, p->sl, 3);
1133
0
        jump.loc = data.size();
1134
0
        jump2.push_back(jump);
1135
        
1136
0
        if (strncmp(prev_sl, p->sl, 2) != 0) {
1137
0
          Jump jump;
1138
0
          strncpy(jump.sl, p->sl, 2);
1139
0
          jump.loc = jump2.size() - 1;
1140
0
          jump1.push_back(jump);
1141
0
        }
1142
1143
0
        data[prev_pos - NEXT_O] = (byte)(data.size() - prev_pos - head_size + 1);
1144
        // when advanced to this position the offset byte will
1145
        // be null (since it will point to the null terminator
1146
        // of the last word) and will thus signal the end of the
1147
        // group
1148
        
1149
0
      } else {
1150
        
1151
0
        data[prev_pos - NEXT_O] = (byte)(data.size() - prev_pos);
1152
        
1153
0
      }
1154
        
1155
0
      prev_pos = data.size();
1156
0
      prev_sl = p->sl;
1157
1158
0
      if (invisible_soundslike) {
1159
        
1160
0
        unsigned pos = data.size();
1161
0
        prev_w_pos = data.size();
1162
0
        data.write(p->word, p->word_size + 1);
1163
0
        if (p->aff) data.write(p->aff, p->data_size - p->word_size - 1);
1164
0
        lookup.insert(pos);
1165
1166
0
        p = p->next;
1167
1168
0
      } else {
1169
1170
0
        data.write(p->sl, p->sl_size + 1);
1171
1172
        // write all word entries with the same soundslike
1173
1174
0
        do {
1175
0
          data.write(p->flags);
1176
0
          data.write(p->data_size + 3);
1177
0
          data.write(p->word_size);
1178
1179
0
          unsigned pos = data.size();
1180
0
          data[prev_w_pos - NEXT_O] = (byte)(pos - prev_w_pos);
1181
0
          data.write(p->word, p->word_size + 1);
1182
0
          if (p->aff) data.write(p->aff, p->data_size - p->word_size - 1);
1183
0
          lookup.insert(pos);
1184
1185
0
          prev_w_pos = pos;
1186
0
          prev_sl = p->sl;
1187
1188
0
          p = p->next;
1189
1190
0
        } while (p && prev_sl == p->sl); // yes I really mean to use pointer compare here
1191
0
      }
1192
0
    }
1193
1194
    // add special end case
1195
0
    if (data.size() % 2 != 0) data.write('\0');
1196
0
    data.write16(0);
1197
0
    data.write16(0);
1198
0
    data[prev_pos - NEXT_O] |= (byte)(data.size() - prev_pos);
1199
    
1200
0
    jump2.push_back(Jump());
1201
0
    jump1.push_back(Jump());
1202
    
1203
0
    data.write(0);
1204
0
    data.write(0);
1205
0
    data.write(0);
1206
1207
0
    if (invisible_soundslike)
1208
0
      data_head.first_word_offset = data[4 - NEXT_O] + 4;
1209
0
    else
1210
0
      data_head.first_word_offset = data[8 - NEXT_O] + 8;
1211
1212
0
    memset(data.data(), 0, 8);
1213
    
1214
    //CERR.printf("%d == %d\n", lookup.size(), uniq_entries);
1215
    //assert(lookup.size() == uniq_entries);
1216
1217
0
    data_head.word_count   = num_entries;
1218
0
    data_head.word_buckets = lookup.bucket_count();
1219
1220
0
    FStream out;
1221
0
    out.open(base, "wb");
1222
1223
0
    advance_file(out, data_head.head_size);
1224
1225
    // Write jump1 table
1226
0
    data_head.jump1_offset = out.tell() - data_head.head_size;
1227
0
    out.write(jump1.data(), jump1.size() * sizeof(Jump));
1228
    
1229
    // Write jump2 table
1230
0
    advance_file(out, round_up(out.tell(), DataHead::align));
1231
0
    data_head.jump2_offset = out.tell() - data_head.head_size;
1232
0
    out.write(jump2.data(), jump2.size() * sizeof(Jump));
1233
1234
    // Write data block
1235
0
    advance_file(out, round_up(out.tell(), DataHead::align));
1236
0
    data_head.word_offset = out.tell() - data_head.head_size;
1237
0
    out.write(data.data(), data.size());
1238
1239
    // Write hash
1240
0
    advance_file(out, round_up(out.tell(), DataHead::align));
1241
0
    data_head.hash_offset = out.tell() - data_head.head_size;
1242
0
    out.write(&lookup.vector().front(), lookup.vector().size() * 4);
1243
    
1244
    // calculate block size
1245
0
    advance_file(out, round_up(out.tell(), DataHead::align));
1246
0
    data_head.block_size = out.tell() - data_head.head_size;
1247
1248
    // write data head to file
1249
0
    out.seek(0);
1250
0
    out.write(&data_head, sizeof(DataHead));
1251
0
    out.write(" ", 1);
1252
0
    out.write(lang.name(), data_head.lang_name_size);
1253
0
    out.write(lang.soundslike_name(), data_head.soundslike_name_size);
1254
0
    out.write(lang.soundslike_version(), data_head.soundslike_version_size);
1255
1256
0
    return no_err;
1257
0
  }
1258
1259
}
1260
1261
namespace aspeller {
1262
  PosibErr<void> create_default_readonly_dict(StringEnumeration * els,
1263
                                              Config & config)
1264
0
  {
1265
0
    CachePtr<Language> lang;
1266
0
    PosibErr<Language *> res = new_language(config);
1267
0
    if (res.has_err()) return res;
1268
0
    lang.reset(res.data);
1269
0
    lang->set_lang_defaults(config);
1270
0
    RET_ON_ERR(create(els,*lang,config));
1271
0
    return no_err;
1272
0
  }
1273
}
1274