/src/aspell/modules/speller/default/readonly_ws.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // This file is part of The New Aspell |
2 | | // Copyright (C) 2000-2001,2011 by Kevin Atkinson under the GNU LGPL |
3 | | // license version 2.0 or 2.1. You should have received a copy of the |
4 | | // LGPL license along with this library if you did not you can find it |
5 | | // at http://www.gnu.org/. |
6 | | |
7 | | // Aspell's main word list is laid out as follows: |
8 | | // |
9 | | // * header |
10 | | // * jump table for editdist 1 |
11 | | // * jump table for editdist 2 |
12 | | // * data block |
13 | | // * hash table |
14 | | |
15 | | // data block laid out as follows: |
16 | | // |
17 | | // Words: |
18 | | // (<8 bit frequency><8 bit: flags><8 bit: offset to next word> |
19 | | // <8 bit: word size><word><null> |
20 | | // [<affix info><null>][<category info><null>])+ |
21 | | // Words with soundslike: |
22 | | // (<8 bit: offset to next item><8 bit: soundslike size><soundslike> |
23 | | // <words with that soundslike>)+ |
24 | | // Flags are mapped as follows: |
25 | | // bits 0-3: word info |
26 | | // bit 4: duplicate flag |
27 | | // bit 5: <unused> |
28 | | // bit 6: have category info |
29 | | // bit 7: have affix info |
30 | | |
31 | | #include <utility> |
32 | | using std::pair; |
33 | | |
34 | | #include <string.h> |
35 | | #include <stdio.h> |
36 | | //#include <errno.h> |
37 | | |
38 | | #include "settings.h" |
39 | | |
40 | | #include "block_vector.hpp" |
41 | | #include "config.hpp" |
42 | | #include "data.hpp" |
43 | | #include "data_util.hpp" |
44 | | #include "errors.hpp" |
45 | | #include "file_util.hpp" |
46 | | #include "fstream.hpp" |
47 | | #include "language.hpp" |
48 | | #include "stack_ptr.hpp" |
49 | | #include "objstack.hpp" |
50 | | #include "vector.hpp" |
51 | | #include "vector_hash-t.hpp" |
52 | | #include "check_list.hpp" |
53 | | #include "lsort.hpp" |
54 | | |
55 | | #include "iostream.hpp" |
56 | | |
57 | | #include "gettext.h" |
58 | | |
59 | | typedef unsigned int u32int; |
60 | | static const u32int u32int_max = (u32int)-1; |
61 | | typedef unsigned short u16int; |
62 | | typedef unsigned char byte; |
63 | | |
64 | | #ifdef USE_32_BIT_HASH_FUN |
65 | | typedef u32int hash_int_t; |
66 | | #else |
67 | | typedef size_t hash_int_t; |
68 | | #endif |
69 | | |
70 | | #ifdef HAVE_MMAP |
71 | | |
72 | | // POSIX headers |
73 | | #include <fcntl.h> |
74 | | #include <unistd.h> |
75 | | #include <sys/mman.h> |
76 | | |
77 | | #endif |
78 | | |
79 | | #ifndef MAP_FAILED |
80 | | #define MAP_FAILED (-1) |
81 | | #endif |
82 | | |
83 | | #ifdef HAVE_MMAP |
84 | | |
85 | | static inline char * mmap_open(unsigned int block_size, |
86 | | FStream & f, |
87 | | unsigned int offset) |
88 | 1.44k | { |
89 | 1.44k | f.flush(); |
90 | 1.44k | int fd = f.file_no(); |
91 | 1.44k | return static_cast<char *> |
92 | 1.44k | (mmap(NULL, block_size, PROT_READ, MAP_SHARED, fd, offset)); |
93 | 1.44k | } |
94 | | |
// Releases a mapping previously obtained from mmap_open().
// `block` is the mapped address and `size` the mapped length in bytes.
// The result of munmap() is not inspected.
static inline void mmap_free(char * block, unsigned int size)
{
  munmap(block, size);
}
99 | | |
100 | | #else |
101 | | |
102 | | static inline char * mmap_open(unsigned int, |
103 | | FStream & f, |
104 | | unsigned int) |
105 | | { |
106 | | return reinterpret_cast<char *>(MAP_FAILED); |
107 | | } |
108 | | |
// Stand-in used when HAVE_MMAP is not defined.  The matching mmap_open()
// never succeeds, so there is never a mapping to release; being called
// at all is treated as fatal.
static inline void mmap_free(char * /* block */, unsigned int /* size */)
{
  abort();
}
113 | | |
114 | | #endif |
115 | | |
116 | | static byte HAVE_AFFIX_FLAG = 1 << 7; |
117 | | static byte HAVE_CATEGORY_FLAG = 1 << 6; |
118 | | |
119 | | static byte DUPLICATE_FLAG = 1 << 4; |
120 | | // this flag is set when there is is more than one word for a |
121 | | // particulear "clean" word such as "jello" "Jello". It is set on all |
122 | | // but the last word of the group. I.e., if it is set, then the next |
123 | | // word when converted to its "clean" form equals the same value. |
124 | | |
125 | | static byte WORD_INFO_MASK = 0x0F; |
126 | | |
127 | | static const int FREQUENCY_INFO_O = 4; |
128 | | static const int FLAGS_O = 3; |
129 | | static const int NEXT_O = 2; |
130 | | static const int WORD_SIZE_O = 1; |
131 | | |
132 | 33.1M | static inline int get_word_size(const char * d) { |
133 | 33.1M | return *reinterpret_cast<const byte *>(d - WORD_SIZE_O); |
134 | 33.1M | } |
135 | | |
136 | 21.7M | static inline byte get_flags(const char * d) { |
137 | 21.7M | return *reinterpret_cast<const byte *>(d - FLAGS_O); |
138 | 21.7M | } |
139 | | |
140 | 26.3M | static inline byte get_offset(const char * d) { |
141 | 26.3M | return *reinterpret_cast<const byte *>(d - NEXT_O); |
142 | 26.3M | } |
143 | | |
144 | 35.5M | static inline const char * get_next(const char * d) { |
145 | 35.5M | return d + *reinterpret_cast<const byte *>(d - NEXT_O); |
146 | 35.5M | } |
147 | | |
148 | 672k | static inline const char * get_sl_words_begin(const char * d) { |
149 | 672k | return d + *reinterpret_cast<const byte *>(d - WORD_SIZE_O) + 4; |
150 | | // FIXME: This isn't right when frequency info is stored in the table |
151 | 672k | } |
152 | | |
153 | | // get_next might go past the end so don't JUST compare |
154 | | // for equality. Ie use while (cur < end) not (cur != end) |
155 | 672k | static inline const char * get_sl_words_end(const char * d) { |
156 | 672k | return get_next(d) - 3; |
157 | 672k | } |
158 | | |
159 | 10.7M | static inline const char * get_affix(const char * d) { |
160 | 10.7M | int word_size = get_word_size(d); |
161 | 10.7M | if (get_flags(d) & HAVE_AFFIX_FLAG) |
162 | 1.50M | return d + word_size + 1; |
163 | 9.19M | else |
164 | 9.19M | return d + word_size; |
165 | 10.7M | } |
166 | | |
167 | 0 | static inline const char * get_category(const char * d) { |
168 | 0 | int word_size = get_word_size(d); |
169 | 0 | if (get_flags(d) & (HAVE_AFFIX_FLAG | HAVE_CATEGORY_FLAG)) |
170 | 0 | return d + strlen(d + word_size + 1) + 1; |
171 | 0 | else if (get_flags(d) & HAVE_CATEGORY_FLAG) |
172 | 0 | return d + word_size + 1; |
173 | 0 | else |
174 | 0 | return d + word_size; |
175 | 0 | } |
176 | | |
177 | 317k | static inline bool duplicate_flag(const char * d) { |
178 | 317k | return get_flags(d) & DUPLICATE_FLAG; |
179 | 317k | } |
180 | | |
181 | | namespace { |
182 | | |
183 | | using namespace aspeller; |
184 | | |
185 | | ///////////////////////////////////////////////////////////////////// |
186 | | // |
187 | | // ReadOnlyDict |
188 | | // |
189 | | |
190 | | struct Jump |
191 | | { |
192 | | char sl[4]; |
193 | | u32int loc; |
194 | 0 | Jump() {memset(this, 0, sizeof(Jump));} |
195 | | }; |
196 | | |
197 | | class ReadOnlyDict : public Dictionary |
198 | | { |
199 | | |
200 | | public: //but don't use |
201 | | |
202 | | struct WordLookupParms { |
203 | | const char * block_begin; |
204 | 1.44k | WordLookupParms() {} |
205 | | typedef BlockVector<const u32int> Vector; |
206 | | typedef u32int Value; |
207 | | typedef const char * Key; |
208 | | static const bool is_multi = false; |
209 | 29.5M | Key key(Value v) const {return block_begin + v;} |
210 | | InsensitiveHash<hash_int_t> hash; |
211 | | InsensitiveEqual equal; |
212 | 44.5M | bool is_nonexistent(Value v) const {return v == u32int_max;} |
213 | 0 | void make_nonexistent(const Value & v) const {abort();} |
214 | | }; |
215 | | typedef VectorHashTable<WordLookupParms> WordLookup; |
216 | | |
217 | | public: // but don't use |
218 | | |
219 | | char * block; |
220 | | u32int block_size; |
221 | | char * mmaped_block; |
222 | | u32int mmaped_size; |
223 | | const Jump * jump1; |
224 | | const Jump * jump2; |
225 | | WordLookup word_lookup; |
226 | | const char * word_block; |
227 | | const char * first_word; |
228 | | |
229 | | ReadOnlyDict(const ReadOnlyDict&); |
230 | | ReadOnlyDict& operator= (const ReadOnlyDict&); |
231 | | |
232 | | struct Elements; |
233 | | struct SoundslikeElements; |
234 | | |
235 | | public: |
236 | | WordEntryEnumeration * detailed_elements() const; |
237 | | Size size() const; |
238 | | bool empty() const; |
239 | | |
240 | | ReadOnlyDict() |
241 | | : Dictionary(basic_dict, "ReadOnlyDict") |
242 | 1.44k | { |
243 | 1.44k | block = 0; |
244 | 1.44k | } |
245 | | |
246 | 1.44k | ~ReadOnlyDict() { |
247 | 1.44k | if (block != 0) { |
248 | 1.44k | if (mmaped_block) |
249 | 1.44k | mmap_free(mmaped_block, mmaped_size); |
250 | 0 | else |
251 | 0 | free(block); |
252 | 1.44k | } |
253 | 1.44k | } |
254 | | |
255 | | PosibErr<void> load(ParmString, Config &, DictList *, SpellerImpl *); |
256 | | PosibErr<void> check_hash_fun() const; |
257 | | void low_level_dump() const; |
258 | | |
259 | | bool lookup(ParmString word, const SensitiveCompare *, WordEntry &) const; |
260 | | |
261 | | bool clean_lookup(ParmString, WordEntry &) const; |
262 | | |
263 | | bool soundslike_lookup(const WordEntry &, WordEntry &) const; |
264 | | bool soundslike_lookup(ParmString, WordEntry &) const; |
265 | | |
266 | | SoundslikeEnumeration * soundslike_elements() const; |
267 | | |
268 | | }; |
269 | | |
270 | 10.7M | static inline void convert(const char * w, WordEntry & o) { |
271 | 10.7M | o.what = WordEntry::Word; |
272 | 10.7M | o.word = w; |
273 | 10.7M | o.aff = get_affix(w); |
274 | 10.7M | o.word_size = get_word_size(w); |
275 | 10.7M | o.word_info = get_flags(w) & WORD_INFO_MASK; |
276 | 10.7M | } |
277 | | |
278 | | // |
279 | | // |
280 | | // |
281 | | |
282 | | struct ReadOnlyDict::Elements : public WordEntryEnumeration |
283 | | { |
284 | | const char * w; |
285 | | WordEntry wi; |
286 | 0 | Elements(const char * w0) : w(w0) {wi.what = WordEntry::Word;} |
287 | 0 | WordEntry * next() { |
288 | 0 | if (get_offset(w) == 0) w += 2; // FIXME: This needs to be 3 |
289 | | // when freq info is used |
290 | 0 | if (get_offset(w) == 0) return 0; |
291 | 0 | convert(w, wi); |
292 | 0 | w = get_next(w); |
293 | 0 | return &wi; |
294 | 0 | } |
295 | 0 | bool at_end() const {return get_offset(w) == 0;} |
296 | 0 | WordEntryEnumeration * clone() const {return new Elements(*this);} |
297 | 0 | void assign (const WordEntryEnumeration * other) { |
298 | 0 | *this = *static_cast<const Elements *>(other);} |
299 | | }; |
300 | | |
301 | 0 | WordEntryEnumeration * ReadOnlyDict::detailed_elements() const { |
302 | 0 | return new Elements(first_word); |
303 | 0 | } |
304 | | |
305 | 0 | void ReadOnlyDict::low_level_dump() const { |
306 | 0 | bool next_dup = false; |
307 | 0 | const char * w = first_word; |
308 | 0 | for (;;) { |
309 | 0 | if (get_offset(w) == 0) w += 2; // FIXME: This needs to be 3 |
310 | 0 | // when freq info is used |
311 | 0 | if (get_offset(w) == 0) break; |
312 | 0 | |
313 | 0 | const char * aff = get_affix(w); |
314 | 0 | byte flags = get_flags(w); |
315 | 0 | byte word_info = flags & WORD_INFO_MASK; |
316 | 0 | byte offset = get_offset(w); |
317 | 0 | int size = get_word_size(w); |
318 | 0 | if (next_dup) printf("\\"); |
319 | 0 | printf("%s", w); |
320 | 0 | if (flags & HAVE_AFFIX_FLAG) printf("/%s", aff); |
321 | 0 | if (word_info) printf(" [WI: %d]", word_info); |
322 | 0 | //if (flags & DUPLICATE_FLAG) printf(" [NEXT DUP]"); |
323 | 0 | const char * p = w; |
324 | 0 | WordLookup::const_iterator i = word_lookup.find(w); |
325 | 0 | if (!next_dup) { |
326 | 0 | if (i == word_lookup.end()) |
327 | 0 | printf(" <BAD HASH>"); |
328 | 0 | else if (word_block + *i != w) { |
329 | 0 | printf(" <BAD HASH, got %s>", word_block + *i); |
330 | 0 | } |
331 | 0 | else |
332 | 0 | printf(" <hash ok>"); |
333 | 0 | } |
334 | 0 | printf("\n"); |
335 | 0 | String buf; |
336 | 0 | if (flags & DUPLICATE_FLAG) next_dup = true; |
337 | 0 | else next_dup = false; |
338 | 0 | w = get_next(w); |
339 | 0 | } |
340 | 0 | } |
341 | | |
342 | 1.44k | PosibErr<void> ReadOnlyDict::check_hash_fun() const { |
343 | 1.44k | const char * w = first_word; |
344 | 57.5k | for (;;) { |
345 | 57.5k | if (get_offset(w) == 0) w += 2; // FIXME: This needs to be 3 |
346 | | // when freq info is used |
347 | 57.5k | if (get_offset(w) == 0) break; |
348 | 57.5k | if (get_word_size(w) >= 12) { |
349 | 2.87k | const char * p = w; |
350 | 2.87k | int clean_size = 0; |
351 | 36.6k | for (;;) { |
352 | 36.6k | if (!*p) goto next; // reached end before clean_size was at |
353 | | // least 12, thus skip |
354 | 35.2k | if (lang()->to_clean(*p)) ++clean_size; |
355 | 35.2k | if (clean_size >= 12) goto clean_size_ok; |
356 | 33.7k | ++p; |
357 | 33.7k | } |
358 | 1.44k | clean_size_ok: |
359 | 1.44k | WordLookup::const_iterator i = word_lookup.find(w); |
360 | 1.44k | if (i == word_lookup.end() || word_block + *i != w) |
361 | 0 | return make_err(bad_file_format, file_name(), |
362 | 0 | _("Incompatible hash function.")); |
363 | 1.44k | else |
364 | 1.44k | return no_err; |
365 | 1.44k | } |
366 | 56.0k | next: |
367 | 67.5k | while (get_flags(w) & DUPLICATE_FLAG) |
368 | 11.5k | w = get_next(w); |
369 | 56.0k | w = get_next(w); |
370 | 56.0k | } |
371 | 0 | return no_err; |
372 | 1.44k | } |
373 | | |
374 | 3.54k | ReadOnlyDict::Size ReadOnlyDict::size() const { |
375 | 3.54k | return word_lookup.size(); |
376 | 3.54k | } |
377 | | |
378 | 0 | bool ReadOnlyDict::empty() const { |
379 | 0 | return word_lookup.empty(); |
380 | 0 | } |
381 | | |
  // Identification string for the file format.  It is written into
  // DataHead::check_word when a file is created, and load() rejects any
  // file whose check_word differs from it.
  static const char * const cur_check_word = "aspell default speller rowl 1.10";
383 | | |
  // Fixed-size header at the very start of a compiled word list.  It is
  // read from and written to the file as raw bytes, so the order and
  // types of the members must not change.
  struct DataHead {
    // all sizes except the last four must be divisible by:
    static const unsigned int align = 16;
    char check_word[64];  // compared against cur_check_word on load
    u32int endian_check; // = 12345678
    char lang_hash[16];

    u32int head_size;     // file offset at which the data block starts
    u32int block_size;    // size of the data block
    // The following offsets are relative to the start of the data block.
    u32int jump1_offset;
    u32int jump2_offset;  // 0 means there are no jump tables
    u32int word_offset;
    u32int hash_offset;

    u32int word_count;
    u32int word_buckets;  // number of u32int entries in the hash table
    u32int soundslike_count;

    // Sizes of the strings that directly follow this struct in the
    // file, in this order.
    u32int dict_name_size;
    u32int lang_name_size;
    u32int soundslike_name_size;
    u32int soundslike_version_size;

    u32int first_word_offset; // from word block

    byte affix_info; // 0 = none, 1 = partially expanded, 2 = full
    byte invisible_soundslike;
    byte soundslike_root_only;
    byte compound_info; //
    byte freq_info;
  };
415 | | |
416 | | PosibErr<void> ReadOnlyDict::load(ParmString f0, Config & config, |
417 | | DictList *, SpellerImpl *) |
418 | 1.44k | { |
419 | 1.44k | set_file_name(f0); |
420 | 1.44k | const char * fn = file_name(); |
421 | | |
422 | 1.44k | FStream f; |
423 | 1.44k | RET_ON_ERR(f.open(fn, "rb")); |
424 | | |
425 | 1.44k | DataHead data_head; |
426 | | |
427 | 1.44k | f.read(&data_head, sizeof(DataHead)); |
428 | | |
429 | | #if 0 |
430 | | COUT << "Head Size: " << data_head.head_size << "\n"; |
431 | | COUT << "Data Block Size: " << data_head.data_block_size << "\n"; |
432 | | COUT << "Hash Block Size: " << data_head.hash_block_size << "\n"; |
433 | | COUT << "Total Block Size: " << data_head.total_block_size << "\n"; |
434 | | #endif |
435 | | |
436 | 1.44k | if (strcmp(data_head.check_word, cur_check_word) != 0) |
437 | 0 | return make_err(bad_file_format, fn); |
438 | | |
439 | 1.44k | if (data_head.endian_check != 12345678) |
440 | 0 | return make_err(bad_file_format, fn, _("Wrong endian order.")); |
441 | | |
442 | 1.44k | CharVector word; |
443 | | |
444 | 1.44k | word.resize(data_head.dict_name_size); |
445 | 1.44k | f.read(word.data(), data_head.dict_name_size); |
446 | | |
447 | 1.44k | word.resize(data_head.lang_name_size); |
448 | 1.44k | f.read(word.data(), data_head.lang_name_size); |
449 | | |
450 | 1.44k | PosibErr<void> pe = set_check_lang(word.data(),config); |
451 | 1.44k | if (pe.has_err()) { |
452 | 4 | if (pe.prvw_err()->is_a(language_related_error)) |
453 | 0 | return pe.with_file(fn); |
454 | 4 | else |
455 | 4 | return pe; |
456 | 4 | } |
457 | | |
458 | 1.44k | if (data_head.soundslike_name_size != 0) { |
459 | 1.44k | word.resize(data_head.soundslike_name_size); |
460 | 1.44k | f.read(word.data(), data_head.soundslike_name_size); |
461 | | |
462 | 1.44k | if (strcmp(word.data(), lang()->soundslike_name()) != 0) |
463 | 0 | return make_err(bad_file_format, fn, _("Wrong soundslike.")); |
464 | | |
465 | 1.44k | word.resize(data_head.soundslike_version_size); |
466 | 1.44k | f.read(word.data(), data_head.soundslike_version_size); |
467 | | |
468 | 1.44k | if (strcmp(word.data(), lang()->soundslike_version()) != 0) |
469 | 0 | return make_err(bad_file_format, fn, _("Wrong soundslike version.")); |
470 | 1.44k | } |
471 | | |
472 | 1.44k | invisible_soundslike = data_head.invisible_soundslike; |
473 | 1.44k | soundslike_root_only = data_head.soundslike_root_only; |
474 | | |
475 | 1.44k | affix_compressed = data_head.affix_info; |
476 | | |
477 | 1.44k | block_size = data_head.block_size; |
478 | 1.44k | int offset = data_head.head_size; |
479 | 1.44k | mmaped_block = mmap_open(block_size + offset, f, 0); |
480 | 1.44k | if( mmaped_block != (char *)MAP_FAILED) { |
481 | 1.44k | block = mmaped_block + offset; |
482 | 1.44k | mmaped_size = block_size + offset; |
483 | 1.44k | } else { |
484 | 0 | mmaped_block = 0; |
485 | 0 | block = (char *)malloc(block_size); |
486 | 0 | f.seek(data_head.head_size); |
487 | 0 | f.read(block, block_size); |
488 | 0 | } |
489 | | |
490 | 1.44k | if (data_head.jump2_offset) { |
491 | 1.44k | fast_scan = true; |
492 | 1.44k | jump1 = reinterpret_cast<const Jump *>(block + data_head.jump1_offset); |
493 | 1.44k | jump2 = reinterpret_cast<const Jump *>(block + data_head.jump2_offset); |
494 | 1.44k | } else { |
495 | 0 | jump1 = jump2 = 0; |
496 | 0 | } |
497 | | |
498 | 1.44k | word_block = block + data_head.word_offset; |
499 | 1.44k | first_word = word_block + data_head.first_word_offset; |
500 | | |
501 | 1.44k | word_lookup.parms().block_begin = word_block; |
502 | 1.44k | word_lookup.parms().hash .lang = lang(); |
503 | 1.44k | word_lookup.parms().equal.cmp.lang = lang(); |
504 | 1.44k | const u32int * begin = reinterpret_cast<const u32int *> |
505 | 1.44k | (block + data_head.hash_offset); |
506 | 1.44k | word_lookup.vector().set(begin, begin + data_head.word_buckets); |
507 | 1.44k | word_lookup.set_size(data_head.word_count); |
508 | | |
509 | | //low_level_dump(); |
510 | 1.44k | RET_ON_ERR(check_hash_fun()); |
511 | | |
512 | 1.44k | return no_err; |
513 | 1.44k | } |
514 | | |
515 | | void lookup_adv(WordEntry * wi); |
516 | | |
517 | | static inline void prep_next(WordEntry * wi, |
518 | | const char * w, |
519 | | const SensitiveCompare * c, |
520 | | const char * orig) |
521 | 17.8k | { |
522 | 17.9k | loop: |
523 | 17.9k | if (!duplicate_flag(w)) return; |
524 | 1.13k | w = get_next(w); |
525 | 1.13k | if (!(*c)(orig, w)) goto loop; |
526 | 1.01k | wi->intr[0] = (void *)w; |
527 | 1.01k | wi->intr[1] = (void *)c; |
528 | 1.01k | wi->intr[2] = (void *)orig; |
529 | 1.01k | wi->adv_ = lookup_adv; |
530 | 1.01k | } |
531 | | |
532 | | void lookup_adv(WordEntry * wi) |
533 | 167 | { |
534 | 167 | const char * w = (const char *)wi->intr[0]; |
535 | 167 | const SensitiveCompare * c = (const SensitiveCompare *)wi->intr[1]; |
536 | 167 | const char * orig = (const char *)wi->intr[2]; |
537 | 167 | convert(w,*wi); |
538 | 167 | wi->adv_ = 0; |
539 | 167 | prep_next(wi, w, c, orig); |
540 | 167 | } |
541 | | |
542 | | bool ReadOnlyDict::lookup(ParmString word, const SensitiveCompare * c, |
543 | | WordEntry & o) const |
544 | 689k | { |
545 | 689k | o.clear(); |
546 | 689k | WordLookup::const_iterator i = word_lookup.find(word); |
547 | 689k | if (i == word_lookup.end()) return false; |
548 | 56.8k | const char * w = word_block + *i; |
549 | 93.7k | for (;;) { |
550 | 93.7k | if ((*c)(word, w)) { |
551 | 17.6k | convert(w,o); |
552 | 17.6k | prep_next(&o, w, c, word); |
553 | 17.6k | return true; |
554 | 17.6k | } |
555 | 76.0k | if (!duplicate_flag(w)) break; |
556 | 36.9k | w = get_next(w); |
557 | 36.9k | } |
558 | 39.1k | return false; |
559 | 56.8k | } |
560 | | |
561 | | struct ReadOnlyDict::SoundslikeElements : public SoundslikeEnumeration |
562 | | { |
563 | | WordEntry data; |
564 | | const ReadOnlyDict * obj; |
565 | | const Jump * jump1; |
566 | | const Jump * jump2; |
567 | | const char * cur; |
568 | | const char * prev; |
569 | | int level; |
570 | | bool invisible_soundslike; |
571 | | |
572 | | WordEntry * next(int stopped_at); |
573 | | |
574 | | SoundslikeElements(const ReadOnlyDict * o) |
575 | | : obj(o), jump1(obj->jump1), jump2(obj->jump2), cur(0), |
576 | 17.8k | level(1), invisible_soundslike(o->invisible_soundslike) { |
577 | 17.8k | data.what = o->invisible_soundslike ? WordEntry::Word : WordEntry::Soundslike;} |
578 | | }; |
579 | | |
  // Returns the next entry of the enumeration, or 0 at the end.
  //
  // `level` selects what is being traversed: 1 = first jump table,
  // 2 = second jump table, 3 = items of the word block.  `stopped_at`
  // is supplied by the caller; a small value makes the enumeration stay
  // in (or fall back to) the jump tables instead of descending, and the
  // values 3, 4 and 5 let word-block items be skipped when they agree
  // with the previously returned item in the corresponding leading
  // characters (see the PRECOND note below).
  //
  // Entries produced from a jump table have intr[0] == 0; entries
  // produced from the word block carry the item's address in intr[0].
  WordEntry * ReadOnlyDict::SoundslikeElements::next(int stopped_at) {

    //CERR << level << ":" << stopped_at << "  :";
    //CERR << jump1->sl << ":" << jump2->sl << "\n";

  loop:

    const char * tmp = cur;
    const char * p;

    if (level == 1 && stopped_at < 2) {

      // stay in the first jump table: move to its next entry
      ++jump1;
      tmp = jump1->sl;
      goto jquit;

    } else if (level == 2 && stopped_at < 3) {

      // stay in the jump tables: move to the next second-level entry,
      // returning to the first level when sl[1] changes
      ++jump2;
      if (jump2[-1].sl[1] != jump2[0].sl[1]) {
        ++jump1;
        level = 1;
        tmp = jump1->sl;
      } else {
        tmp = jump2->sl;
      }
      goto jquit;

    } else if (level == 1) {

      // descend from the first to the second jump table
      level = 2;
      jump2 = obj->jump2 + jump1->loc;
      tmp = jump2->sl;
      goto jquit;

    } else if (level == 2) {

      // descend from the second jump table into the word block
      tmp = cur = obj->word_block + jump2->loc;
      level = 3;

    } else if (get_offset(cur) == 0) {

      // a zero offset ends this run of items: go back up to the jump
      // tables
      level = 2;
      ++jump2;
      if (jump2[-1].sl[1] != jump2[0].sl[1]) {
        level = 1;
        ++jump1;
        tmp = jump1->sl;
      } else {
        tmp = jump2->sl;
      }
      goto jquit;

    }

    cur = get_next(cur); // this will be the NEXT item looked at

    p = prev;
    prev = tmp;
    if (p) {
      // PRECOND:
      // unless stopped_at >= LARGE_NUM
      //   strlen(p) >= stopped_at
      //   (stopped_at >= 3) implies
      //     strncmp(p, tmp, 3) == 0 if !invisible_soundslike
      //     strncmp(to_sl(p), to_sl(tmp), 3) == 0 if invisible_soundslike
      if (stopped_at == 3) {
        if (p[3] == tmp[3]) goto loop;
      } else if (stopped_at == 4) {
        if (p[3] == tmp[3] && tmp[3] &&
            p[4] == tmp[4]) goto loop;
      } else if (stopped_at == 5) {
        if (p[3] == tmp[3] && tmp[3] &&
            p[4] == tmp[4] && tmp[4] &&
            p[5] == tmp[5]) goto loop;
      }
    }

    // hand out the word-block item
    data.word = tmp;
    data.word_size = get_word_size(tmp);
    if (invisible_soundslike) {
      convert(tmp, data);
    }
    data.intr[0] = (void *)tmp;

    return &data;

  jquit:
    // hand out a jump table entry; an empty one marks the end
    prev = 0;
    if (!*tmp) return 0;
    data.word = tmp;
    // jump table entries hold at most three characters
    data.word_size = !tmp[1] ? 1 : !tmp[2] ? 2 : 3;
    data.intr[0] = 0;
    if (invisible_soundslike) {
      data.what = WordEntry::Clean;
      data.aff  = 0;
    }
    return &data;
  }
679 | | |
680 | 17.8k | SoundslikeEnumeration * ReadOnlyDict::soundslike_elements() const { |
681 | | |
682 | 17.8k | return new SoundslikeElements(this); |
683 | | |
684 | 17.8k | } |
685 | | |
686 | | static void soundslike_next(WordEntry * w) |
687 | 8.52M | { |
688 | 8.52M | const char * cur = (const char *)(w->intr[0]); |
689 | 8.52M | const char * end = (const char *)(w->intr[1]); |
690 | 8.52M | convert(cur, *w); |
691 | 8.52M | cur = get_next(cur); |
692 | 8.52M | w->intr[0] = (void *)cur; |
693 | 8.52M | if (cur >= end) w->adv_ = 0; |
694 | 8.52M | } |
695 | | |
696 | | static void clean_lookup_adv(WordEntry * wi) |
697 | 54.9k | { |
698 | 54.9k | const char * w = wi->word; |
699 | 54.9k | w = get_next(w); |
700 | 54.9k | convert(w,*wi); |
701 | 54.9k | if (!duplicate_flag(w)) wi->adv_ = 0; |
702 | 54.9k | } |
703 | | |
704 | | bool ReadOnlyDict::clean_lookup(ParmString sl, WordEntry & o) const |
705 | 6.93M | { |
706 | 6.93M | o.clear(); |
707 | 6.93M | WordLookup::const_iterator i = word_lookup.find(sl); |
708 | 6.93M | if (i == word_lookup.end()) return false; |
709 | 168k | const char * w = word_block + *i; |
710 | 168k | convert(w, o); |
711 | 168k | if (duplicate_flag(w)) o.adv_ = clean_lookup_adv; |
712 | 168k | return true; |
713 | 6.93M | } |
714 | | |
715 | | bool ReadOnlyDict::soundslike_lookup(const WordEntry & s, WordEntry & w) const |
716 | 1.93M | { |
717 | 1.93M | if (s.intr[0] == 0) { |
718 | | |
719 | 640k | return false; |
720 | | |
721 | 1.29M | } else if (!invisible_soundslike) { |
722 | | |
723 | 672k | w.clear(); |
724 | 672k | w.what = WordEntry::Word; |
725 | 672k | w.intr[0] = (void *)get_sl_words_begin(s.word); |
726 | 672k | w.intr[1] = (void *)get_sl_words_end(s.word); |
727 | 672k | w.adv_ = soundslike_next; |
728 | 672k | soundslike_next(&w); |
729 | 672k | return true; |
730 | | |
731 | 672k | } else { |
732 | | |
733 | 623k | w.clear(); |
734 | 623k | w.what = WordEntry::Word; |
735 | 623k | convert(s.word, w); |
736 | 623k | return true; |
737 | | |
738 | 623k | } |
739 | 1.93M | } |
740 | | |
741 | | bool ReadOnlyDict::soundslike_lookup(ParmString s, WordEntry & w) const |
742 | 0 | { |
743 | 0 | if (invisible_soundslike) { |
744 | 0 | return ReadOnlyDict::clean_lookup(s,w); |
745 | 0 | } else { |
746 | 0 | return false; |
747 | 0 | } |
748 | 0 | } |
749 | | |
750 | | } |
751 | | |
752 | | namespace aspeller { |
753 | | |
754 | 1.44k | Dictionary * new_default_readonly_dict() { |
755 | 1.44k | return new ReadOnlyDict(); |
756 | 1.44k | } |
757 | | |
758 | | } |
759 | | |
760 | | namespace { |
761 | | |
762 | | // Possible: |
763 | | // No Affix Compression: |
764 | | // no soundslike |
765 | | // invisible soundslike |
766 | | // with soundslike |
767 | | // Affix Compression: |
768 | | // group by root: |
769 | | // no soundslike |
770 | | // invisible soundslike |
771 | | // with soundslike |
772 | | // expand prefix: |
773 | | // no soundslike |
774 | | // invisible soundslike |
775 | | |
776 | | using namespace aspeller; |
777 | | |
  // In-memory record for one word while a dictionary is being built.
  // Records are chained through `next`.  `word` is declared with a
  // single element but marks the start of a longer, variable-sized area
  // (hence struct_size below) -- NOTE(review): presumably the record is
  // allocated with extra room behind it; confirm against the allocation
  // code.
  struct WordData {
    static const unsigned struct_size;
    WordData * next;
    char * sl;
    char * aff;
    byte word_size;
    byte sl_size;
    byte data_size;
    byte flags;
    char word[1];
  };

  // Size of a WordData without the one placeholder character of `word`.
  const unsigned WordData::struct_size = sizeof(WordData) - 1;
791 | | |
792 | | |
793 | | struct SoundslikeLess { |
794 | | InsensitiveCompare icomp; |
795 | 0 | SoundslikeLess(const Language * l) : icomp(l) {} |
796 | 0 | bool operator() (WordData * x, WordData * y) const { |
797 | 0 | int res = strcmp(x->sl, y->sl); |
798 | 0 | if (res != 0) return res < 0; |
799 | 0 | res = icomp(x->word, y->word); |
800 | 0 | if (res != 0) return res < 0; |
801 | 0 | return strcmp(x->word, y->word) < 0; |
802 | 0 | } |
803 | | }; |
804 | | |
805 | | struct WordLookupParms { |
806 | | const char * block_begin; |
807 | 0 | WordLookupParms() {} |
808 | | typedef acommon::Vector<u32int> Vector; |
809 | | typedef u32int Value; |
810 | | typedef const char * Key; |
811 | | static const bool is_multi = false; |
812 | 0 | Key key(Value v) const {return block_begin + v;} |
813 | | InsensitiveHash<hash_int_t> hash; |
814 | | InsensitiveEqual equal; |
815 | 0 | bool is_nonexistent(Value v) const {return v == u32int_max;} |
816 | 0 | void make_nonexistent(Value & v) const {v = u32int_max;} |
817 | | }; |
818 | | typedef VectorHashTable<WordLookupParms> WordLookup; |
819 | | |
820 | 0 | static inline unsigned int round_up(unsigned int i, unsigned int size) { |
821 | 0 | return ((i + size - 1)/size)*size; |
822 | 0 | } |
823 | | |
824 | 0 | static void advance_file(FStream & out, int pos) { |
825 | 0 | int diff = pos - out.tell(); |
826 | 0 | assert(diff >= 0); |
827 | 0 | for(; diff != 0; --diff) |
828 | 0 | out << '\0'; |
829 | 0 | } |
830 | | |
831 | | PosibErr<void> create (StringEnumeration * els, |
832 | | const Language & lang, |
833 | | Config & config) |
834 | 0 | { |
835 | 0 | assert(sizeof(u16int) == 2); |
836 | 0 | assert(sizeof(u32int) == 4); |
837 | | |
838 | 0 | bool full_soundslike = !(strcmp(lang.soundslike_name(), "none") == 0 || |
839 | 0 | strcmp(lang.soundslike_name(), "stripped") == 0 || |
840 | 0 | strcmp(lang.soundslike_name(), "simple") == 0); |
841 | |
|
842 | 0 | bool affix_compress = (lang.affix() && |
843 | 0 | config.retrieve_bool("affix-compress")); |
844 | |
|
845 | 0 | bool partially_expand = (affix_compress && |
846 | 0 | !full_soundslike && |
847 | 0 | config.retrieve_bool("partially-expand")); |
848 | |
|
849 | 0 | bool invisible_soundslike = false; |
850 | 0 | if (partially_expand) |
851 | 0 | invisible_soundslike = true; |
852 | 0 | else if (config.have("invisible-soundslike")) |
853 | 0 | invisible_soundslike = config.retrieve_bool("invisible-soundslike"); |
854 | 0 | else if (!full_soundslike) |
855 | 0 | invisible_soundslike = true; |
856 | |
|
857 | 0 | ConvEC iconv; |
858 | 0 | if (!config.have("norm-strict")) |
859 | 0 | config.replace("norm-strict", "true"); |
860 | 0 | if (config.have("encoding")) { |
861 | 0 | String enc = config.retrieve("encoding"); |
862 | 0 | RET_ON_ERR(iconv.setup(config, enc, lang.charmap(), NormFrom)); |
863 | 0 | } else { |
864 | 0 | RET_ON_ERR(iconv.setup(config, lang.data_encoding(), lang.charmap(), NormFrom)); |
865 | 0 | } |
866 | | |
867 | 0 | String base = config.retrieve("master-path"); |
868 | |
|
869 | 0 | DataHead data_head; |
870 | 0 | memset(&data_head, 0, sizeof(data_head)); |
871 | 0 | strcpy(data_head.check_word, cur_check_word); |
872 | |
|
873 | 0 | data_head.endian_check = 12345678; |
874 | |
|
875 | 0 | data_head.dict_name_size = 1; |
876 | 0 | data_head.lang_name_size = strlen(lang.name()) + 1; |
877 | 0 | data_head.soundslike_name_size = strlen(lang.soundslike_name()) + 1; |
878 | 0 | data_head.soundslike_version_size = strlen(lang.soundslike_version()) + 1; |
879 | 0 | data_head.head_size = sizeof(DataHead); |
880 | 0 | data_head.head_size += data_head.dict_name_size; |
881 | 0 | data_head.head_size += data_head.lang_name_size; |
882 | 0 | data_head.head_size += data_head.soundslike_name_size; |
883 | 0 | data_head.head_size += data_head.soundslike_version_size; |
884 | 0 | data_head.head_size = round_up(data_head.head_size, DataHead::align); |
885 | |
|
886 | 0 | data_head.affix_info = affix_compress ? partially_expand ? 1 : 2 : 0; |
887 | 0 | data_head.invisible_soundslike = invisible_soundslike; |
888 | 0 | data_head.soundslike_root_only = affix_compress && !partially_expand ? 1 : 0; |
889 | |
|
890 | | #if 0 |
891 | | CERR.printl("FLAGS: "); |
892 | | if (full_soundslike) CERR.printl(" full soundslike"); |
893 | | if (invisible_soundslike) CERR.printl(" invisible soundslike"); |
894 | | if (data_head.soundslike_root_only) CERR.printl(" soundslike root only"); |
895 | | if (affix_compress) CERR.printl(" affix compress"); |
896 | | if (partially_expand) CERR.printl(" partially expand"); |
897 | | CERR.printl("---"); |
898 | | #endif |
899 | | |
900 | 0 | String temp; |
901 | |
|
902 | 0 | int num_entries = 0; |
903 | 0 | int uniq_entries = 0; |
904 | | |
905 | 0 | ObjStack buf(16*1024); |
906 | 0 | String sl_buf; |
907 | |
|
908 | 0 | WordData * first = 0; |
909 | | |
910 | | // |
911 | | // Read in Wordlist |
912 | | // |
913 | 0 | { |
914 | 0 | WordListIterator wl_itr(els, &lang, config.retrieve_bool("warn") ? &CERR : 0); |
915 | 0 | wl_itr.init(config); |
916 | 0 | ObjStack exp_buf; |
917 | 0 | WordAff * exp_list; |
918 | 0 | WordAff single; |
919 | 0 | single.next = 0; |
920 | 0 | Vector<WordAff> af_list; |
921 | 0 | WordData * * prev = &first; |
922 | |
|
923 | 0 | for (;;) { |
924 | |
|
925 | 0 | PosibErr<bool> pe = wl_itr.adv(); |
926 | 0 | if (pe.has_err()) return pe; |
927 | 0 | if (!pe.data) break; |
928 | | |
929 | 0 | const char * w = wl_itr->word.str; |
930 | 0 | unsigned int s = wl_itr->word.size; |
931 | |
|
932 | 0 | const char * affixes = wl_itr->aff.str; |
933 | |
|
934 | 0 | if (*affixes && !lang.affix()) |
935 | 0 | return make_err(other_error, |
936 | 0 | _("Affix flags found in word but no affix file given.")); |
937 | | |
938 | 0 | if (*affixes && !affix_compress) { |
939 | 0 | exp_buf.reset(); |
940 | 0 | exp_list = lang.affix()->expand(w, affixes, exp_buf); |
941 | 0 | } else if (*affixes && partially_expand) { |
942 | | // expand any affixes which will affect the first |
943 | | // 3 letters of a word. This is needed so that the |
944 | | // jump tables will function correctly |
945 | 0 | exp_buf.reset(); |
946 | 0 | exp_list = lang.affix()->expand(w, affixes, exp_buf, 3); |
947 | 0 | } else { |
948 | 0 | single.word.str = w; |
949 | 0 | single.word.size = strlen(w); |
950 | 0 | single.aff = (const byte *)affixes; |
951 | 0 | exp_list = &single; |
952 | 0 | } |
953 | | |
954 | | // iterate through each expanded word |
955 | | |
956 | 0 | for (WordAff * p = exp_list; p; p = p->next) |
957 | 0 | { |
958 | 0 | const char * w = p->word.str; |
959 | 0 | s = p->word.size; |
960 | | |
961 | 0 | unsigned total_size = WordData::struct_size; |
962 | 0 | unsigned data_size = s + 1; |
963 | 0 | unsigned aff_size = strlen((const char *)p->aff); |
964 | 0 | if (aff_size > 0) data_size += aff_size + 1; |
965 | 0 | total_size += data_size; |
966 | 0 | lang.to_soundslike(sl_buf, w); |
967 | 0 | const char * sl = sl_buf.str(); |
968 | 0 | unsigned sl_size = sl_buf.size(); |
969 | 0 | if (strcmp(sl,w) == 0) sl = w; |
970 | 0 | if (sl != w) total_size += sl_size + 1; |
971 | |
|
972 | 0 | if (total_size - WordData::struct_size > 240) |
973 | 0 | return make_err(invalid_word, MsgConv(lang)(w), |
974 | 0 | _("The total word length, with soundslike data, is larger than 240 characters.")); |
975 | | |
976 | 0 | WordData * b = (WordData *)buf.alloc(total_size, sizeof(void *)); |
977 | 0 | *prev = b; |
978 | 0 | b->next = 0; |
979 | 0 | prev = &b->next; |
980 | | |
981 | 0 | b->word_size = s; |
982 | 0 | b->sl_size = strlen(sl); |
983 | 0 | b->data_size = data_size; |
984 | 0 | b->flags = lang.get_word_info(w); |
985 | |
|
986 | 0 | char * z = b->word; |
987 | |
|
988 | 0 | memcpy(z, w, s + 1); |
989 | 0 | z += s + 1; |
990 | |
|
991 | 0 | if (aff_size > 0) { |
992 | 0 | b->flags |= HAVE_AFFIX_FLAG; |
993 | 0 | b->aff = z; |
994 | 0 | memcpy(z, p->aff, aff_size + 1); |
995 | 0 | z += aff_size + 1; |
996 | 0 | } else { |
997 | 0 | b->aff = 0; |
998 | 0 | } |
999 | |
|
1000 | 0 | if (sl != w) { |
1001 | 0 | memcpy(z, sl, sl_size + 1); |
1002 | 0 | b->sl = z; |
1003 | 0 | } else { |
1004 | 0 | b->sl = b->word; |
1005 | 0 | } |
1006 | |
|
1007 | 0 | } |
1008 | 0 | } |
1009 | 0 | delete els; |
1010 | 0 | } |
1011 | | |
1012 | | // |
1013 | | // sort WordData linked list based on (sl, word) |
1014 | | // |
1015 | | |
1016 | 0 | first = sort(first, SoundslikeLess(&lang)); |
1017 | | |
1018 | | // |
1019 | | // duplicate check |
1020 | | // |
1021 | 0 | WordData * prev = first; |
1022 | 0 | WordData * cur = first ? first->next : 0; |
1023 | 0 | InsensitiveEqual ieq(&lang); |
1024 | 0 | while (cur) { |
1025 | 0 | if (strcmp(prev->word, cur->word) == 0) { |
1026 | | // merge affix info if necessary |
1027 | 0 | if (!prev->aff && cur->aff) { |
1028 | 0 | prev->flags |= HAVE_AFFIX_FLAG; |
1029 | 0 | prev->aff = cur->aff; |
1030 | 0 | prev->data_size += strlen(prev->aff) + 1; |
1031 | 0 | } else if (prev->aff && cur->aff) { |
1032 | 0 | unsigned l1 = strlen(prev->aff); |
1033 | 0 | unsigned l2 = strlen(cur->aff); |
1034 | 0 | char * aff = (char *)buf.alloc(l1 + l2 + 1); |
1035 | 0 | memcpy(aff, prev->aff, l1); |
1036 | 0 | prev->aff = aff; |
1037 | 0 | aff += l1; |
1038 | 0 | for (const char * p = cur->aff; *p; ++p) { |
1039 | 0 | if (memchr(prev->aff, *p, l1)) continue; |
1040 | 0 | *aff = *p; |
1041 | 0 | ++aff; |
1042 | 0 | } |
1043 | 0 | *aff = '\0'; |
1044 | 0 | prev->data_size = prev->word_size + (aff - prev->aff) + 2; |
1045 | 0 | } |
1046 | 0 | prev->next = cur->next; |
1047 | 0 | } else { |
1048 | 0 | if (ieq(prev->word, cur->word)) prev->flags |= DUPLICATE_FLAG; |
1049 | 0 | else ++uniq_entries; |
1050 | 0 | ++num_entries; |
1051 | 0 | prev = cur; |
1052 | 0 | } |
1053 | 0 | cur = cur->next; |
1054 | 0 | } |
1055 | | |
1056 | | // |
1057 | | // |
1058 | | // |
1059 | |
|
1060 | 0 | unsigned data_size = 16; |
1061 | 0 | WordData * p = first; |
1062 | 0 | if (invisible_soundslike) { |
1063 | | |
1064 | 0 | for (; p; p = p->next) |
1065 | 0 | data_size += 3 + p->data_size; |
1066 | |
|
1067 | 0 | } else { |
1068 | |
|
1069 | 0 | while (p) |
1070 | 0 | { |
1071 | 0 | unsigned ds = 2 + p->sl_size + 1; |
1072 | |
|
1073 | 0 | char * prev = p->sl; |
1074 | |
|
1075 | 0 | do { |
1076 | | |
1077 | 0 | ds += 3 + p->data_size; |
1078 | 0 | p->sl = prev; |
1079 | 0 | p = p->next; |
1080 | |
|
1081 | 0 | } while (p && strcmp(prev, p->sl) == 0 && ds + 3 + p->data_size < 255); |
1082 | |
|
1083 | 0 | data_size += ds; |
1084 | |
|
1085 | 0 | } |
1086 | |
|
1087 | 0 | } |
1088 | | |
1089 | | // |
1090 | | // Create the final data structures |
1091 | | // |
1092 | |
|
1093 | 0 | CharVector data; |
1094 | 0 | data.reserve(data_size); |
1095 | 0 | data.write32(0); // to avoid nasty special cases |
1096 | 0 | unsigned int prev_pos = data.size(); |
1097 | 0 | data.write32(0); |
1098 | 0 | unsigned prev_w_pos = data.size(); |
1099 | |
|
1100 | 0 | WordLookup lookup(affix_compress |
1101 | 0 | ? uniq_entries * 3 / 2 |
1102 | 0 | : uniq_entries * 5 / 4); |
1103 | 0 | lookup.parms().block_begin = data.begin(); |
1104 | 0 | lookup.parms().hash .lang = ⟨ |
1105 | 0 | lookup.parms().equal.cmp.lang = ⟨ |
1106 | |
|
1107 | 0 | Vector<Jump> jump1; |
1108 | 0 | Vector<Jump> jump2; |
1109 | |
|
1110 | 0 | const int head_size = invisible_soundslike ? 3 : 2; |
1111 | |
|
1112 | 0 | const char * prev_sl = ""; |
1113 | 0 | p = first; |
1114 | 0 | while (p) |
1115 | 0 | { |
1116 | 0 | if (invisible_soundslike) { |
1117 | |
|
1118 | 0 | data.write(p->flags); // flags |
1119 | 0 | data.write('\0'); // place holder for offset to next item |
1120 | 0 | data.write(p->word_size); |
1121 | |
|
1122 | 0 | } else { |
1123 | |
|
1124 | 0 | data.write('\0'); // place holder for offset to next item |
1125 | 0 | data.write(p->sl_size); |
1126 | |
|
1127 | 0 | } |
1128 | | |
1129 | 0 | if (strncmp(prev_sl, p->sl, 3) != 0) { |
1130 | | |
1131 | 0 | Jump jump; |
1132 | 0 | strncpy(jump.sl, p->sl, 3); |
1133 | 0 | jump.loc = data.size(); |
1134 | 0 | jump2.push_back(jump); |
1135 | | |
1136 | 0 | if (strncmp(prev_sl, p->sl, 2) != 0) { |
1137 | 0 | Jump jump; |
1138 | 0 | strncpy(jump.sl, p->sl, 2); |
1139 | 0 | jump.loc = jump2.size() - 1; |
1140 | 0 | jump1.push_back(jump); |
1141 | 0 | } |
1142 | |
|
1143 | 0 | data[prev_pos - NEXT_O] = (byte)(data.size() - prev_pos - head_size + 1); |
1144 | | // when advanced to this position the offset byte will |
1145 | | // be null (since it will point to the null terminator |
1146 | | // of the last word) and will thus signal the end of the |
1147 | | // group |
1148 | | |
1149 | 0 | } else { |
1150 | | |
1151 | 0 | data[prev_pos - NEXT_O] = (byte)(data.size() - prev_pos); |
1152 | | |
1153 | 0 | } |
1154 | | |
1155 | 0 | prev_pos = data.size(); |
1156 | 0 | prev_sl = p->sl; |
1157 | |
|
1158 | 0 | if (invisible_soundslike) { |
1159 | | |
1160 | 0 | unsigned pos = data.size(); |
1161 | 0 | prev_w_pos = data.size(); |
1162 | 0 | data.write(p->word, p->word_size + 1); |
1163 | 0 | if (p->aff) data.write(p->aff, p->data_size - p->word_size - 1); |
1164 | 0 | lookup.insert(pos); |
1165 | |
|
1166 | 0 | p = p->next; |
1167 | |
|
1168 | 0 | } else { |
1169 | |
|
1170 | 0 | data.write(p->sl, p->sl_size + 1); |
1171 | | |
1172 | | // write all word entries with the same soundslike |
1173 | |
|
1174 | 0 | do { |
1175 | 0 | data.write(p->flags); |
1176 | 0 | data.write(p->data_size + 3); |
1177 | 0 | data.write(p->word_size); |
1178 | |
|
1179 | 0 | unsigned pos = data.size(); |
1180 | 0 | data[prev_w_pos - NEXT_O] = (byte)(pos - prev_w_pos); |
1181 | 0 | data.write(p->word, p->word_size + 1); |
1182 | 0 | if (p->aff) data.write(p->aff, p->data_size - p->word_size - 1); |
1183 | 0 | lookup.insert(pos); |
1184 | |
|
1185 | 0 | prev_w_pos = pos; |
1186 | 0 | prev_sl = p->sl; |
1187 | |
|
1188 | 0 | p = p->next; |
1189 | |
|
1190 | 0 | } while (p && prev_sl == p->sl); // yes I really mean to use pointer compare here |
1191 | 0 | } |
1192 | 0 | } |
1193 | | |
1194 | | // add special end case |
1195 | 0 | if (data.size() % 2 != 0) data.write('\0'); |
1196 | 0 | data.write16(0); |
1197 | 0 | data.write16(0); |
1198 | 0 | data[prev_pos - NEXT_O] |= (byte)(data.size() - prev_pos); |
1199 | | |
1200 | 0 | jump2.push_back(Jump()); |
1201 | 0 | jump1.push_back(Jump()); |
1202 | | |
1203 | 0 | data.write(0); |
1204 | 0 | data.write(0); |
1205 | 0 | data.write(0); |
1206 | |
|
1207 | 0 | if (invisible_soundslike) |
1208 | 0 | data_head.first_word_offset = data[4 - NEXT_O] + 4; |
1209 | 0 | else |
1210 | 0 | data_head.first_word_offset = data[8 - NEXT_O] + 8; |
1211 | |
|
1212 | 0 | memset(data.data(), 0, 8); |
1213 | | |
1214 | | //CERR.printf("%d == %d\n", lookup.size(), uniq_entries); |
1215 | | //assert(lookup.size() == uniq_entries); |
1216 | |
|
1217 | 0 | data_head.word_count = num_entries; |
1218 | 0 | data_head.word_buckets = lookup.bucket_count(); |
1219 | |
|
1220 | 0 | FStream out; |
1221 | 0 | out.open(base, "wb"); |
1222 | |
|
1223 | 0 | advance_file(out, data_head.head_size); |
1224 | | |
1225 | | // Write jump1 table |
1226 | 0 | data_head.jump1_offset = out.tell() - data_head.head_size; |
1227 | 0 | out.write(jump1.data(), jump1.size() * sizeof(Jump)); |
1228 | | |
1229 | | // Write jump2 table |
1230 | 0 | advance_file(out, round_up(out.tell(), DataHead::align)); |
1231 | 0 | data_head.jump2_offset = out.tell() - data_head.head_size; |
1232 | 0 | out.write(jump2.data(), jump2.size() * sizeof(Jump)); |
1233 | | |
1234 | | // Write data block |
1235 | 0 | advance_file(out, round_up(out.tell(), DataHead::align)); |
1236 | 0 | data_head.word_offset = out.tell() - data_head.head_size; |
1237 | 0 | out.write(data.data(), data.size()); |
1238 | | |
1239 | | // Write hash |
1240 | 0 | advance_file(out, round_up(out.tell(), DataHead::align)); |
1241 | 0 | data_head.hash_offset = out.tell() - data_head.head_size; |
1242 | 0 | out.write(&lookup.vector().front(), lookup.vector().size() * 4); |
1243 | | |
1244 | | // calculate block size |
1245 | 0 | advance_file(out, round_up(out.tell(), DataHead::align)); |
1246 | 0 | data_head.block_size = out.tell() - data_head.head_size; |
1247 | | |
1248 | | // write data head to file |
1249 | 0 | out.seek(0); |
1250 | 0 | out.write(&data_head, sizeof(DataHead)); |
1251 | 0 | out.write(" ", 1); |
1252 | 0 | out.write(lang.name(), data_head.lang_name_size); |
1253 | 0 | out.write(lang.soundslike_name(), data_head.soundslike_name_size); |
1254 | 0 | out.write(lang.soundslike_version(), data_head.soundslike_version_size); |
1255 | |
|
1256 | 0 | return no_err; |
1257 | 0 | } |
1258 | | |
1259 | | } |
1260 | | |
1261 | | namespace aspeller { |
1262 | | PosibErr<void> create_default_readonly_dict(StringEnumeration * els, |
1263 | | Config & config) |
1264 | 0 | { |
1265 | 0 | CachePtr<Language> lang; |
1266 | 0 | PosibErr<Language *> res = new_language(config); |
1267 | 0 | if (res.has_err()) return res; |
1268 | 0 | lang.reset(res.data); |
1269 | 0 | lang->set_lang_defaults(config); |
1270 | 0 | RET_ON_ERR(create(els,*lang,config)); |
1271 | 0 | return no_err; |
1272 | 0 | } |
1273 | | } |
1274 | | |