Coverage Report

Created: 2026-01-16 07:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/hunspell/src/hunspell/hashmgr.cxx
Line
Count
Source
1
/* ***** BEGIN LICENSE BLOCK *****
2
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3
 *
4
 * Copyright (C) 2002-2022 Németh László
5
 *
6
 * The contents of this file are subject to the Mozilla Public License Version
7
 * 1.1 (the "License"); you may not use this file except in compliance with
8
 * the License. You may obtain a copy of the License at
9
 * http://www.mozilla.org/MPL/
10
 *
11
 * Software distributed under the License is distributed on an "AS IS" basis,
12
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13
 * for the specific language governing rights and limitations under the
14
 * License.
15
 *
16
 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17
 *
18
 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19
 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20
 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21
 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22
 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23
 *
24
 * Alternatively, the contents of this file may be used under the terms of
25
 * either the GNU General Public License Version 2 or later (the "GPL"), or
26
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27
 * in which case the provisions of the GPL or the LGPL are applicable instead
28
 * of those above. If you wish to allow use of your version of this file only
29
 * under the terms of either the GPL or the LGPL, and not to allow others to
30
 * use your version of this file under the terms of the MPL, indicate your
31
 * decision by deleting the provisions above and replace them with the notice
32
 * and other provisions required by the GPL or the LGPL. If you do not delete
33
 * the provisions above, a recipient may use your version of this file under
34
 * the terms of any one of the MPL, the GPL or the LGPL.
35
 *
36
 * ***** END LICENSE BLOCK ***** */
37
/*
38
 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39
 * And Contributors.  All rights reserved.
40
 *
41
 * Redistribution and use in source and binary forms, with or without
42
 * modification, are permitted provided that the following conditions
43
 * are met:
44
 *
45
 * 1. Redistributions of source code must retain the above copyright
46
 *    notice, this list of conditions and the following disclaimer.
47
 *
48
 * 2. Redistributions in binary form must reproduce the above copyright
49
 *    notice, this list of conditions and the following disclaimer in the
50
 *    documentation and/or other materials provided with the distribution.
51
 *
52
 * 3. All modifications to the source code must be clearly marked as
53
 *    such.  Binary redistributions based on modified source code
54
 *    must be clearly marked as modified versions in the documentation
55
 *    and/or other materials provided with the distribution.
56
 *
57
 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
61
 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68
 * SUCH DAMAGE.
69
 */
70
71
#include <cstdlib>
72
#include <cstring>
73
#include <cstdio>
74
#include <cctype>
75
#include <limits>
76
#include <sstream>
77
#if __cplusplus >= 202002L || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
78
#include <bit>
79
#endif
80
81
#include "hashmgr.hxx"
82
#include "csutil.hxx"
83
#include "atypes.hxx"
84
#include "langnum.hxx"
85
86
// build a hash table from a munched word list
87
88
// Construct the hash manager: read the configuration via load_config(apath,
// key), fall back to the SPELL_ENCODING character set when no conversion
// table was set, then build the hash table from the dictionary at tpath.
// If load_tables() reports an error the table is released and replaced by a
// single empty bucket.
HashMgr::HashMgr(const char* tpath, const char* apath, const char* key)
    : flag_mode(FLAG_CHAR),
      complexprefixes(0),
      utf8(0),
      forbiddenword(FORBIDDENWORD),  // forbidden word signing flag
      langnum(0),
      csconv(NULL) {
  load_config(apath, key);
  if (csconv == NULL) {
    csconv = get_current_cs(SPELL_ENCODING);
  }

  const int status = load_tables(tpath, key);
  if (status == 0) {
    return;
  }

  /* error condition - what should we do here */
  HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n", status);
  free_table();
  // keep table size to 1 to fix possible division with zero
  tableptr.resize(1, nullptr);
}
108
109
489k
void HashMgr::free_flag(unsigned short* astr, int alen) {
110
489k
  if (astr && (aliasf.empty() || TESTAFF(astr, ONLYUPCASEFLAG, alen)))
111
188k
    delete[] astr;
112
489k
}
113
114
23.0k
void HashMgr::free_table() {
115
  // now pass through hash table freeing up everything
116
  // go through column by column of the table
117
9.83M
  for (auto ptr : tableptr) {
118
9.83M
    hentry* nt = NULL;
119
10.3M
    while (ptr) {
120
489k
      nt = ptr->next;
121
489k
      free_flag(ptr->astr, ptr->alen);
122
489k
      free(ptr);
123
489k
      ptr = nt;
124
489k
    }
125
9.83M
  }
126
23.0k
  tableptr.clear();
127
23.0k
}
128
129
15.2k
// Destroy the manager: free the hash table, then the flag-alias and
// morphological-alias arrays. Under MOZILLA_CLIENT the conversion table is
// deleted as well.
HashMgr::~HashMgr() {
  free_table();

  for (size_t k = 0; k < aliasf.size(); ++k) {
    delete[] aliasf[k];
  }
  aliasf.clear();

  for (size_t k = 0; k < aliasm.size(); ++k) {
    delete[] aliasm[k];
  }
  aliasm.clear();

#ifdef MOZILLA_CLIENT
  delete[] csconv;
#endif
}
144
145
// lookup a root word in the hashtable
146
147
2.82G
struct hentry* HashMgr::lookup(const char* word, size_t len) const {
148
2.82G
  struct hentry* dp = tableptr[hash(word, len)];
149
2.82G
  if (!dp)
150
2.43G
    return NULL;
151
824M
  for (; dp != NULL; dp = dp->next) {
152
593M
    if (strcmp(word, dp->word) == 0)
153
161M
      return dp;
154
593M
  }
155
231M
  return NULL;
156
393M
}
157
158
// add a word to the hash table (private)
159
int HashMgr::add_word(const std::string& in_word,
160
                      int wcl,
161
                      unsigned short* aff,
162
                      int al,
163
                      const std::string* in_desc,
164
                      bool onlyupcase,
165
544k
                      int captype) {
166
167
544k
  if (al > std::numeric_limits<short>::max()) {
168
7
    HUNSPELL_WARNING(stderr, "error: affix len %d is over max limit\n", al);
169
7
    free_flag(aff, al);
170
7
    return 1;
171
7
  }
172
173
544k
  const std::string* word = &in_word;
174
544k
  const std::string* desc = in_desc;
175
176
544k
  std::string *word_copy = NULL;
177
544k
  std::string *desc_copy = NULL;
178
544k
  if ((!ignorechars.empty() && !has_no_ignored_chars(in_word, ignorechars)) || complexprefixes) {
179
298k
    word_copy = new std::string(in_word);
180
181
298k
    if (!ignorechars.empty()) {
182
105k
      if (utf8) {
183
30.7k
        wcl = remove_ignored_chars_utf(*word_copy, ignorechars_utf16);
184
74.6k
      } else {
185
74.6k
        remove_ignored_chars(*word_copy, ignorechars);
186
74.6k
      }
187
105k
    }
188
189
298k
    if (complexprefixes) {
190
253k
      if (utf8)
191
32.6k
        wcl = reverseword_utf(*word_copy);
192
221k
      else
193
221k
        reverseword(*word_copy);
194
195
253k
      if (in_desc && aliasm.empty()) {
196
33.1k
        desc_copy = new std::string(*in_desc);
197
198
33.1k
        if (complexprefixes) {
199
33.1k
          if (utf8)
200
5.94k
            reverseword_utf(*desc_copy);
201
27.1k
          else
202
27.1k
            reverseword(*desc_copy);
203
33.1k
        }
204
33.1k
        desc = desc_copy;
205
33.1k
      }
206
253k
    }
207
208
298k
    word = word_copy;
209
298k
  }
210
211
  // limit of hp->blen
212
544k
  if (word->size() > std::numeric_limits<unsigned short>::max()) {
213
5
    HUNSPELL_WARNING(stderr, "error: word len %ld is over max limit\n", word->size());
214
5
    delete desc_copy;
215
5
    delete word_copy;
216
5
    free_flag(aff, al);
217
5
    return 1;
218
5
  }
219
220
544k
  bool upcasehomonym = false;
221
544k
  int descl = desc ? (!aliasm.empty() ? sizeof(char*) : desc->size() + 1) : 0;
222
  // variable-length hash record with word and optional fields
223
544k
  auto hp =
224
544k
      (struct hentry*)malloc(sizeof(struct hentry) + word->size() + descl);
225
544k
  if (!hp) {
226
0
    delete desc_copy;
227
0
    delete word_copy;
228
0
    free_flag(aff, al);
229
0
    return 1;
230
0
  }
231
232
544k
  char* hpw = hp->word;
233
544k
  memcpy(hpw, word->data(), word->size());
234
544k
  hpw[word->size()] = 0;
235
236
544k
  int i = hash(hpw, word->size());
237
238
544k
  hp->blen = (unsigned short)word->size();
239
544k
  hp->clen = (unsigned short)wcl;
240
544k
  hp->alen = (short)al;
241
544k
  hp->astr = aff;
242
544k
  hp->next = NULL;
243
544k
  hp->next_homonym = NULL;
244
544k
  hp->var = (captype == INITCAP) ? H_OPT_INITCAP : 0;
245
246
  // store the description string or its pointer
247
544k
  if (desc) {
248
101k
    hp->var |= H_OPT;
249
101k
    if (!aliasm.empty()) {
250
17.4k
      hp->var |= H_OPT_ALIASM;
251
17.4k
      store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str())));
252
83.7k
    } else {
253
83.7k
      strcpy(hpw + word->size() + 1, desc->c_str());
254
83.7k
    }
255
101k
    if (HENTRY_FIND(hp, MORPH_PHON)) {
256
28.0k
      hp->var |= H_OPT_PHON;
257
      // store ph: fields (pronounciation, misspellings, old orthography etc.)
258
      // of a morphological description in reptable to use in REP replacements.
259
28.0k
      size_t predicted = tableptr.size() / MORPH_PHON_RATIO;
260
28.0k
      if (reptable.capacity() < predicted)
261
1.25k
          reptable.reserve(predicted);
262
28.0k
      std::string fields = HENTRY_DATA(hp);
263
28.0k
      std::string::const_iterator iter = fields.begin(), start_piece = mystrsep(fields, iter);
264
93.4k
      while (start_piece != fields.end()) {
265
65.3k
        if (std::string(start_piece, iter).find(MORPH_PHON) == 0) {
266
27.9k
          std::string ph = std::string(start_piece, iter).substr(sizeof MORPH_PHON - 1);
267
27.9k
          if (!ph.empty()) {
268
27.2k
            std::vector<w_char> w;
269
27.2k
            size_t strippatt;
270
27.2k
            std::string wordpart;
271
            // dictionary based REP replacement, separated by "->"
272
            // for example "pretty ph:prity ph:priti->pretti" to handle
273
            // both prity -> pretty and pritier -> prettiest suggestions.
274
27.2k
            if (((strippatt = ph.find("->")) != std::string::npos) &&
275
5.77k
                    (strippatt > 0) && (strippatt < ph.size() - 2)) {
276
5.68k
                wordpart = ph.substr(strippatt + 2);
277
5.68k
                ph.erase(ph.begin() + strippatt, ph.end());
278
5.68k
            } else
279
21.5k
                wordpart = in_word;
280
            // when the ph: field ends with the character *,
281
            // strip last character of the pattern and the replacement
282
            // to match in REP suggestions also at character changes,
283
            // for example, "pretty ph:prity*" results "prit->prett"
284
            // REP replacement instead of "prity->pretty", to get
285
            // prity->pretty and pritiest->prettiest suggestions.
286
27.2k
            if (ph.at(ph.size()-1) == '*') {
287
7.34k
              strippatt = 1;
288
7.34k
              size_t stripword = 0;
289
7.34k
              if (utf8) {
290
19.3k
                while ((strippatt < ph.size()) &&
291
18.9k
                  ((ph.at(ph.size()-strippatt-1) & 0xc0) == 0x80))
292
13.1k
                     ++strippatt;
293
7.79k
                while ((stripword < wordpart.size()) &&
294
7.33k
                  ((wordpart.at(wordpart.size()-stripword-1) & 0xc0) == 0x80))
295
1.57k
                     ++stripword;
296
6.21k
              }
297
7.34k
              ++strippatt;
298
7.34k
              ++stripword;
299
7.34k
              if ((ph.size() > strippatt) && (wordpart.size() > stripword)) {
300
6.23k
                ph.erase(ph.size()-strippatt, strippatt);
301
6.23k
                wordpart.erase(wordpart.size()-stripword, stripword);
302
6.23k
              }
303
7.34k
            }
304
            // capitalize lowercase pattern for capitalized words to support
305
            // good suggestions also for capitalized misspellings, eg.
306
            // Wednesday ph:wendsay
307
            // results wendsay -> Wednesday and Wendsay -> Wednesday, too.
308
27.2k
            if (captype == INITCAP) {
309
8.61k
              std::string ph_capitalized;
310
8.61k
              if (utf8) {
311
5.89k
                u8_u16(w, ph);
312
5.89k
                if (get_captype_utf8(w, langnum) == NOCAP) {
313
5.24k
                  mkinitcap_utf(w, langnum);
314
5.24k
                  u16_u8(ph_capitalized, w);
315
5.24k
                }
316
5.89k
              } else if (get_captype(ph, csconv) == NOCAP)
317
2.05k
                  mkinitcap(ph_capitalized, csconv);
318
319
8.61k
              if (!ph_capitalized.empty()) {
320
                // add also lowercase word in the case of German or
321
                // Hungarian to support lowercase suggestions lowercased by
322
                // compound word generation or derivational suffixes
323
                // (for example by adjectival suffix "-i" of geographical
324
                // names in Hungarian:
325
                // Massachusetts ph:messzecsuzec
326
                // messzecsuzeci -> massachusettsi (adjective)
327
                // For lowercasing by conditional PFX rules, see
328
                // tests/germancompounding test example or the
329
                // Hungarian dictionary.)
330
5.24k
                if (langnum == LANG_de || langnum == LANG_hu) {
331
3.21k
                  std::string wordpart_lower(wordpart);
332
3.21k
                  if (utf8) {
333
3.21k
                    u8_u16(w, wordpart_lower);
334
3.21k
                    mkallsmall_utf(w, langnum);
335
3.21k
                    u16_u8(wordpart_lower, w);
336
3.21k
                  } else {
337
0
                    mkallsmall(wordpart_lower, csconv);
338
0
                  }
339
3.21k
                  reptable.emplace_back();
340
3.21k
                  reptable.back().pattern.assign(ph);
341
3.21k
                  reptable.back().outstrings[0].assign(wordpart_lower);
342
3.21k
                }
343
5.24k
                reptable.emplace_back();
344
5.24k
                reptable.back().pattern.assign(ph_capitalized);
345
5.24k
                reptable.back().outstrings[0].assign(wordpart);
346
5.24k
              }
347
8.61k
            }
348
27.2k
            reptable.emplace_back();
349
27.2k
            reptable.back().pattern.assign(ph);
350
27.2k
            reptable.back().outstrings[0].assign(wordpart);
351
27.2k
          }
352
27.9k
        }
353
65.3k
        start_piece = mystrsep(fields, iter);
354
65.3k
      }
355
28.0k
    }
356
101k
  }
357
358
544k
  struct hentry* dp = tableptr[i];
359
544k
  if (!dp) {
360
175k
    tableptr[i] = hp;
361
175k
    delete desc_copy;
362
175k
    delete word_copy;
363
175k
    return 0;
364
175k
  }
365
22.3M
  while (dp->next != NULL) {
366
21.9M
    if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) {
367
      // remove hidden onlyupcase homonym
368
22.2k
      if (!onlyupcase) {
369
13.0k
        if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
370
221
          delete[] dp->astr;
371
221
          dp->astr = hp->astr;
372
221
          dp->alen = hp->alen;
373
221
          free(hp);
374
221
          delete desc_copy;
375
221
          delete word_copy;
376
221
          return 0;
377
12.8k
        } else {
378
12.8k
          dp->next_homonym = hp;
379
12.8k
        }
380
13.0k
      } else {
381
9.21k
        upcasehomonym = true;
382
9.21k
      }
383
22.2k
    }
384
21.9M
    dp = dp->next;
385
21.9M
  }
386
369k
  if (strcmp(hp->word, dp->word) == 0) {
387
    // remove hidden onlyupcase homonym
388
237k
    if (!onlyupcase) {
389
192k
      if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
390
738
        delete[] dp->astr;
391
738
        dp->astr = hp->astr;
392
738
        dp->alen = hp->alen;
393
738
        free(hp);
394
738
        delete desc_copy;
395
738
        delete word_copy;
396
738
        return 0;
397
191k
      } else {
398
191k
        dp->next_homonym = hp;
399
191k
      }
400
192k
    } else {
401
45.0k
      upcasehomonym = true;
402
45.0k
    }
403
237k
  }
404
368k
  if (!upcasehomonym) {
405
314k
    dp->next = hp;
406
314k
  } else {
407
    // remove hidden onlyupcase homonym
408
54.2k
    delete[] hp->astr;
409
54.2k
    free(hp);
410
54.2k
  }
411
412
368k
  delete desc_copy;
413
368k
  delete word_copy;
414
368k
  return 0;
415
369k
}
416
417
int HashMgr::add_hidden_capitalized_word(const std::string& word,
418
                                         int wcl,
419
                                         unsigned short* flags,
420
                                         int flagslen,
421
                                         const std::string* dp,
422
386k
                                         int captype) {
423
386k
  if (flags == NULL)
424
300k
    flagslen = 0;
425
426
  // add inner capitalized forms to handle the following allcap forms:
427
  // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG
428
  // Allcaps with suffixes: CIA's -> CIA'S
429
386k
  if (((captype == HUHCAP) || (captype == HUHINITCAP) ||
430
242k
       ((captype == ALLCAP) && (flagslen != 0))) &&
431
159k
      !((flagslen != 0) && TESTAFF(flags, forbiddenword, flagslen))) {
432
158k
    unsigned short* flags2 = new unsigned short[flagslen + 1];
433
158k
    flags2[flagslen] = ONLYUPCASEFLAG;
434
158k
    if (flagslen) {
435
60.2k
      memcpy(flags2, flags, flagslen * sizeof(unsigned short));
436
60.2k
      std::sort(flags2, flags2 + flagslen + 1);
437
60.2k
    }
438
158k
    if (utf8) {
439
28.3k
      std::string st;
440
28.3k
      std::vector<w_char> w;
441
28.3k
      u8_u16(w, word);
442
28.3k
      mkallsmall_utf(w, langnum);
443
28.3k
      mkinitcap_utf(w, langnum);
444
28.3k
      u16_u8(st, w);
445
28.3k
      return add_word(st, wcl, flags2, flagslen + 1, dp, true, INITCAP);
446
129k
    } else {
447
129k
      std::string new_word(word);
448
129k
      mkallsmall(new_word, csconv);
449
129k
      mkinitcap(new_word, csconv);
450
129k
      int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true, INITCAP);
451
129k
      return ret;
452
129k
    }
453
158k
  }
454
228k
  return 0;
455
386k
}
456
457
// detect captype and modify word length for UTF-8 encoding
458
386k
int HashMgr::get_clen_and_captype(const std::string& word, int* captype, std::vector<w_char> &workbuf) {
459
386k
  int len;
460
386k
  if (utf8) {
461
112k
    len = u8_u16(workbuf, word);
462
112k
    *captype = get_captype_utf8(workbuf, langnum);
463
273k
  } else {
464
273k
    len = word.size();
465
273k
    *captype = get_captype(word, csconv);
466
273k
  }
467
386k
  return len;
468
386k
}
469
470
0
int HashMgr::get_clen_and_captype(const std::string& word, int* captype) {
471
0
  std::vector<w_char> workbuf;
472
0
  return get_clen_and_captype(word, captype, workbuf);
473
0
}
474
475
// remove word (personal dictionary function for standalone applications)
476
0
int HashMgr::remove(const std::string& word) {
477
0
  struct hentry* dp = lookup(word.c_str(), word.size());
478
0
  while (dp) {
479
0
    if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) {
480
0
      auto flags = new unsigned short[dp->alen + 1];
481
0
      for (int i = 0; i < dp->alen; i++)
482
0
        flags[i] = dp->astr[i];
483
0
      flags[dp->alen] = forbiddenword;
484
0
      delete[] dp->astr;
485
0
      dp->astr = flags;
486
0
      dp->alen++;
487
0
      std::sort(flags, flags + dp->alen);
488
0
    }
489
0
    dp = dp->next_homonym;
490
0
  }
491
0
  return 0;
492
0
}
493
494
/* remove forbidden flag to add a personal word to the hash */
495
0
void HashMgr::remove_forbidden_flag(const std::string& word) {
496
0
  struct hentry* dp = lookup(word.c_str(), word.size());
497
0
  if (!dp)
498
0
    return;
499
0
  while (dp) {
500
0
    if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen))
501
0
      dp->alen = 0;  // XXX forbidden words of personal dic.
502
0
    dp = dp->next_homonym;
503
0
  }
504
0
}
505
506
// add a custom dic. word to the hash table (public)
507
0
int HashMgr::add(const std::string& word) {
508
0
  remove_forbidden_flag(word);
509
0
  int captype, al = 0;
510
0
  unsigned short* flags = NULL;
511
0
  int wcl = get_clen_and_captype(word, &captype);
512
0
  add_word(word, wcl, flags, al, NULL, false, captype);
513
0
  return add_hidden_capitalized_word(word, wcl, flags, al, NULL,
514
0
                                     captype);
515
0
}
516
517
0
int HashMgr::add_with_flags(const std::string& word, const std::string& flags, const std::string& desc) {
518
0
  remove_forbidden_flag(word);
519
0
  int captype;
520
0
  unsigned short *df;
521
0
  int al = decode_flags(&df, flags, NULL);
522
0
  int wcl = get_clen_and_captype(word, &captype);
523
0
  add_word(word, wcl, df, al, &desc, false, captype);
524
0
  return add_hidden_capitalized_word(word, wcl, df, al, &desc, captype);
525
0
}
526
527
0
int HashMgr::add_with_affix(const std::string& word, const std::string& example) {
528
  // detect captype and modify word length for UTF-8 encoding
529
0
  struct hentry* dp = lookup(example.c_str(), example.size());
530
0
  remove_forbidden_flag(word);
531
0
  if (dp && dp->astr) {
532
0
    int captype;
533
0
    int wcl = get_clen_and_captype(word, &captype);
534
0
    if (!aliasf.empty()) {
535
0
      add_word(word, wcl, dp->astr, dp->alen, NULL, false, captype);
536
0
    } else {
537
0
      auto flags = new unsigned short[dp->alen];
538
0
      memcpy(flags, dp->astr, dp->alen * sizeof(unsigned short));
539
0
      add_word(word, wcl, flags, dp->alen, NULL, false, captype);
540
0
    }
541
0
    return add_hidden_capitalized_word(word, wcl, dp->astr,
542
0
                                       dp->alen, NULL, captype);
543
0
  }
544
0
  return 1;
545
0
}
546
547
// walk the hash table entry by entry - null at end
548
// initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);
549
3.84M
struct hentry* HashMgr::walk_hashtable(int& col, struct hentry* hp) const {
550
3.84M
  if (hp && hp->next != NULL)
551
1.89M
    return hp->next;
552
77.7M
  for (col++; col < (int)tableptr.size(); ++col) {
553
77.7M
    if (tableptr[col])
554
1.86M
      return tableptr[col];
555
77.7M
  }
556
  // null at end and reset to start
557
91.5k
  col = -1;
558
91.5k
  return NULL;
559
1.95M
}
560
561
// load a munched word list and build a hash table on the fly
562
15.2k
int HashMgr::load_tables(const char* tpath, const char* key) {
563
  // open dictionary file
564
15.2k
  FileMgr* dict = new FileMgr(tpath, key);
565
15.2k
  if (dict == NULL)
566
0
    return 1;
567
568
  // first read the first line of file to get hash table size
569
15.2k
  std::string ts;
570
15.2k
  if (!dict->getline(ts)) {
571
343
    HUNSPELL_WARNING(stderr, "error: empty dic file %s\n", tpath);
572
343
    delete dict;
573
343
    return 2;
574
343
  }
575
14.8k
  mychomp(ts);
576
577
  /* remove byte order mark */
578
14.8k
  if (ts.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
579
1
    ts.erase(0, 3);
580
1
  }
581
582
14.8k
  int tablesize = atoi(ts.c_str());
583
584
14.8k
  const int nExtra = 5 + USERWORD;
585
#if !defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
586
  const int max_allowed = (std::numeric_limits<int>::max() - 1 - nExtra) / int(sizeof(struct hentry*));
587
#else
588
14.8k
  const int max_allowed = (100000 - 1 - nExtra) / int(sizeof(struct hentry*));
589
14.8k
#endif
590
591
14.8k
  if (tablesize <= 0 || tablesize >= max_allowed) {
592
7.45k
    HUNSPELL_WARNING(
593
7.45k
        stderr, "error: line 1: missing or bad word count in the dic file\n");
594
7.45k
    delete dict;
595
7.45k
    return 4;
596
7.45k
  }
597
7.43k
  tablesize += nExtra;
598
7.43k
  if ((tablesize & 1) == 0)
599
4.94k
    tablesize++;
600
601
  // allocate the hash table
602
7.43k
  tableptr.resize(tablesize, nullptr);
603
604
  // loop through all words on much list and add to hash
605
  // table and create word and affix strings
606
607
7.43k
  std::vector<w_char> workbuf;
608
609
7.43k
  int nLineCount(0);
610
394k
  while (dict->getline(ts)) {
611
386k
    ++nLineCount;
612
386k
    mychomp(ts);
613
    // split each line into word and morphological description
614
386k
    size_t dp_pos = 0;
615
416k
    while ((dp_pos = ts.find(':', dp_pos)) != std::string::npos) {
616
45.6k
      if ((dp_pos > 3) && (ts[dp_pos - 3] == ' ' || ts[dp_pos - 3] == '\t')) {
617
17.0k
        for (dp_pos -= 3; dp_pos > 0 && (ts[dp_pos-1] == ' ' || ts[dp_pos-1] == '\t'); --dp_pos)
618
1.83k
          ;
619
15.2k
        if (dp_pos == 0) {  // missing word
620
354
          dp_pos = std::string::npos;
621
14.8k
        } else {
622
14.8k
          ++dp_pos;
623
14.8k
        }
624
15.2k
        break;
625
15.2k
      }
626
30.4k
      ++dp_pos;
627
30.4k
    }
628
629
    // tabulator is the old morphological field separator
630
386k
    size_t dp2_pos = ts.find('\t');
631
386k
    if (dp2_pos != std::string::npos && (dp_pos == std::string::npos || dp2_pos < dp_pos)) {
632
71.7k
      dp_pos = dp2_pos + 1;
633
71.7k
    }
634
635
386k
    std::string dp;
636
386k
    if (dp_pos != std::string::npos) {
637
76.0k
      dp.assign(ts.substr(dp_pos));
638
76.0k
      ts.resize(dp_pos - 1);
639
76.0k
    }
640
641
    // split each line into word and affix char strings
642
    // "\/" signs slash in words (not affix separator)
643
    // "/" at beginning of the line is word character (not affix separator)
644
386k
    size_t ap_pos = ts.find('/');
645
391k
    while (ap_pos != std::string::npos) {
646
101k
      if (ap_pos == 0) {
647
3.97k
        ++ap_pos;
648
3.97k
        continue;
649
97.5k
      } else if (ts[ap_pos - 1] != '\\')
650
96.8k
        break;
651
      // replace "\/" with "/"
652
649
      ts.erase(ap_pos - 1, 1);
653
649
      ap_pos = ts.find('/', ap_pos);
654
649
    }
655
656
386k
    unsigned short* flags;
657
386k
    int al;
658
386k
    if (ap_pos != std::string::npos && ap_pos != ts.size()) {
659
96.2k
      std::string ap(ts.substr(ap_pos + 1));
660
96.2k
      ts.resize(ap_pos);
661
96.2k
      if (!aliasf.empty()) {
662
4.04k
        int index = atoi(ap.c_str());
663
4.04k
        al = get_aliasf(index, &flags, dict);
664
4.04k
        if (!al) {
665
3.00k
          HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n",
666
3.00k
                           dict->getlinenum());
667
3.00k
        }
668
92.2k
      } else {
669
92.2k
        al = decode_flags(&flags, ap, dict);
670
92.2k
        if (al == -1) {
671
0
          HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");
672
0
          delete dict;
673
0
          return 6;
674
0
        }
675
92.2k
        std::sort(flags, flags + al);
676
92.2k
      }
677
290k
    } else {
678
290k
      al = 0;
679
290k
      flags = NULL;
680
290k
    }
681
682
386k
    int captype;
683
386k
    int wcl = get_clen_and_captype(ts, &captype, workbuf);
684
386k
    const std::string *dp_str = dp.empty() ? NULL : &dp;
685
    // add the word and its index plus its capitalized form optionally
686
386k
    if (add_word(ts, wcl, flags, al, dp_str, false, captype) ||
687
386k
        add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) {
688
12
      delete dict;
689
12
      return 5;
690
12
    }
691
386k
  }
692
693
7.42k
  int ret(0);
694
695
  // reject ludicrous tablesizes
696
7.42k
  if (tablesize > 8192 + nExtra && tablesize > nLineCount * 10 + nExtra) {
697
42
    HUNSPELL_WARNING(stderr, ".dic initial approximate word count line value of %d is too large for %d lines\n", tablesize, nLineCount);
698
42
    ret = 3;
699
42
  }
700
701
7.42k
  delete dict;
702
7.42k
  return ret;
703
7.43k
}
704
705
// the hash function is a simple load and rotate
706
// algorithm borrowed
707
2.82G
int HashMgr::hash(const char* word, size_t len) const {
708
2.82G
  unsigned long hv = 0;
709
2.82G
  size_t i = 0;
710
13.1G
  while (i < 4 && i < len)
711
10.3G
    hv = (hv << 8) | word[i++];
712
138G
  while (i < len) {
713
136G
    ROTATE(hv, ROTATE_LEN);
714
136G
    hv ^= word[i++];
715
136G
  }
716
2.82G
  return (unsigned long)hv % tableptr.size();
717
2.82G
}
718
719
111k
int HashMgr::decode_flags(unsigned short** result, const std::string& flags, FileMgr* af) const {
720
111k
  int len;
721
111k
  if (flags.empty()) {
722
11.8k
    *result = NULL;
723
11.8k
    return 0;
724
11.8k
  }
725
99.8k
  switch (flag_mode) {
726
10.2k
    case FLAG_LONG: {  // two-character flags (1x2yZz -> 1x 2y Zz)
727
10.2k
      len = flags.size();
728
10.2k
      if ((len & 1) == 1 && af != NULL)
729
5.67k
        HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n",
730
5.67k
                         af->getlinenum());
731
10.2k
      len >>= 1;
732
10.2k
      *result = new unsigned short[len];
733
153k
      for (int i = 0; i < len; i++) {
734
143k
        unsigned short flag = ((unsigned short)((unsigned char)flags[i << 1]) << 8) |
735
143k
                              ((unsigned short)((unsigned char)flags[(i << 1) | 1]));
736
737
143k
        if (flag >= DEFAULTFLAGS && af != NULL) {
738
9.40k
          HUNSPELL_WARNING(stderr,
739
9.40k
                           "error: line %d: flag id %d is too large (max: %d)\n",
740
9.40k
                           af->getlinenum(), flag, DEFAULTFLAGS - 1);
741
9.40k
          flag = 0;
742
9.40k
        }
743
744
143k
        (*result)[i] = flag;
745
143k
      }
746
10.2k
      break;
747
0
    }
748
4.50k
    case FLAG_NUM: {  // decimal numbers separated by comma (4521,23,233 -> 4521
749
                      // 23 233)
750
3.32M
      len = int(1 + std::count_if(flags.begin(), flags.end(), [](char c) { return c == ','; }));
751
4.50k
      *result = new unsigned short[len];
752
4.50k
      unsigned short* dest = *result;
753
4.50k
      const char* src = flags.c_str();
754
3.33M
      for (size_t p = 0; p < flags.size(); ++p) {
755
3.32M
        if (flags[p] == ',') {
756
27.5k
          int i = atoi(src);
757
27.5k
          if ((i >= DEFAULTFLAGS || i < 0) && af != NULL) {
758
1.53k
            HUNSPELL_WARNING(
759
1.53k
                stderr, "error: line %d: flag id %d is too large (max: %d)\n",
760
1.53k
                af->getlinenum(), i, DEFAULTFLAGS - 1);
761
1.53k
             i = 0;
762
1.53k
    }
763
27.5k
          *dest = (unsigned short)i;
764
27.5k
          if (*dest == 0 && af != NULL)
765
25.3k
            HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
766
25.3k
                             af->getlinenum());
767
27.5k
          src = flags.c_str() + p + 1;
768
27.5k
          dest++;
769
27.5k
        }
770
3.32M
      }
771
4.50k
      int i = atoi(src);
772
4.50k
      if (i >= DEFAULTFLAGS || i < 0) {
773
432
        HUNSPELL_WARNING(stderr,
774
432
                         "error: line %d: flag id %d is too large (max: %d)\n",
775
432
                         af->getlinenum(), i, DEFAULTFLAGS - 1);
776
432
        i = 0;
777
432
      }
778
4.50k
      *dest = (unsigned short)i;
779
4.50k
      if (*dest == 0)
780
3.24k
        HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
781
3.24k
                         af->getlinenum());
782
4.50k
      break;
783
0
    }
784
11.5k
    case FLAG_UNI: {  // UTF-8 characters
785
11.5k
      std::vector<w_char> w;
786
11.5k
      u8_u16(w, flags);
787
11.5k
      len = w.size();
788
11.5k
      *result = new unsigned short[len];
789
#if defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
790
      memcpy(*result, w.data(), len * sizeof(unsigned short));
791
#else
792
11.5k
      unsigned short* dest = *result;
793
433k
      for (const w_char wc : w) {
794
433k
        *dest = (unsigned short)wc;
795
433k
        dest++;
796
433k
      }
797
11.5k
#endif
798
11.5k
      break;
799
0
    }
800
73.4k
    default: {  // Ispell's one-character flags (erfg -> e r f g)
801
73.4k
      len = flags.size();
802
73.4k
      *result = new unsigned short[len];
803
73.4k
      unsigned short* dest = *result;
804
4.93M
      for (const char flag : flags) {
805
4.93M
        *dest = (unsigned char)flag;
806
4.93M
        dest++;
807
4.93M
      }
808
73.4k
    }
809
99.8k
  }
810
99.8k
  return len;
811
99.8k
}
812
813
68.7k
bool HashMgr::decode_flags(std::vector<unsigned short>& result, const std::string& flags, FileMgr* af) const {
814
68.7k
  if (flags.empty()) {
815
91
    return false;
816
91
  }
817
68.6k
  switch (flag_mode) {
818
24.1k
    case FLAG_LONG: {  // two-character flags (1x2yZz -> 1x 2y Zz)
819
24.1k
      size_t len = flags.size();
820
24.1k
      if ((len & 1) == 1)
821
23.4k
        HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n",
822
23.4k
                         af->getlinenum());
823
24.1k
      len >>= 1;
824
24.1k
      result.reserve(result.size() + len);
825
160k
      for (size_t i = 0; i < len; ++i) {
826
136k
        result.push_back(((unsigned short)((unsigned char)flags[i << 1]) << 8) |
827
136k
                     ((unsigned short)((unsigned char)flags[(i << 1) | 1])));
828
136k
      }
829
24.1k
      break;
830
0
    }
831
5.69k
    case FLAG_NUM: {  // decimal numbers separated by comma (4521,23,233 -> 4521
832
                      // 23 233)
833
5.69k
      const char* src = flags.c_str();
834
48.0k
      for (const char* p = src; *p; p++) {
835
42.3k
        if (*p == ',') {
836
3.13k
          int i = atoi(src);
837
3.13k
          if (i >= DEFAULTFLAGS) {
838
589
            HUNSPELL_WARNING(
839
589
                stderr, "error: line %d: flag id %d is too large (max: %d)\n",
840
589
                af->getlinenum(), i, DEFAULTFLAGS - 1);
841
589
            i = 0;
842
589
    }
843
3.13k
          result.push_back((unsigned short)i);
844
3.13k
          if (result.back() == 0)
845
1.30k
            HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
846
1.30k
                             af->getlinenum());
847
3.13k
          src = p + 1;
848
3.13k
        }
849
42.3k
      }
850
5.69k
      int i = atoi(src);
851
5.69k
      if (i >= DEFAULTFLAGS) {
852
394
        HUNSPELL_WARNING(stderr,
853
394
                         "error: line %d: flag id %d is too large (max: %d)\n",
854
394
                         af->getlinenum(), i, DEFAULTFLAGS - 1);
855
394
        i = 0;
856
394
      }
857
5.69k
      result.push_back((unsigned short)i);
858
5.69k
      if (result.back() == 0)
859
4.50k
        HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
860
4.50k
                         af->getlinenum());
861
5.69k
      break;
862
0
    }
863
1.80k
    case FLAG_UNI: {  // UTF-8 characters
864
1.80k
      std::vector<w_char> w;
865
1.80k
      u8_u16(w, flags);
866
1.80k
      size_t len = w.size(), origsize = result.size();
867
#if defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
868
      result.resize(origsize + len);
869
      memcpy(result.data() + origsize, w.data(), len * sizeof(short));
870
#else
871
1.80k
      result.reserve(origsize + len);
872
2.33k
      for (const w_char wc : w) result.push_back((unsigned short)wc);
873
1.80k
#endif
874
1.80k
      break;
875
0
    }
876
36.9k
    default: {  // Ispell's one-character flags (erfg -> e r f g)
877
36.9k
      result.reserve(flags.size());
878
654k
      for (const char flag : flags) {
879
654k
        result.push_back((unsigned char)flag);
880
654k
      }
881
36.9k
    }
882
68.6k
  }
883
68.6k
  return true;
884
68.6k
}
885
886
95.5k
unsigned short HashMgr::decode_flag(const std::string& f) const {
887
95.5k
  unsigned short s = 0;
888
95.5k
  int i;
889
95.5k
  switch (flag_mode) {
890
5.64k
    case FLAG_LONG:
891
5.64k
      if (f.size() >= 2)
892
2.56k
        s = ((unsigned short)((unsigned char)f[0]) << 8) | ((unsigned short)((unsigned char)f[1]));
893
5.64k
      break;
894
1.99k
    case FLAG_NUM:
895
1.99k
      i = atoi(f.c_str());
896
1.99k
      if (i >= DEFAULTFLAGS) {
897
96
        HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n",
898
96
                         i, DEFAULTFLAGS - 1);
899
96
        i = 0;
900
96
      }
901
1.99k
      s = (unsigned short)i;
902
1.99k
      break;
903
1.99k
    case FLAG_UNI: {
904
1.99k
      std::vector<w_char> w;
905
1.99k
      u8_u16(w, f);
906
1.99k
      if (!w.empty())
907
1.89k
        s = (unsigned short)w[0];
908
1.99k
      break;
909
0
    }
910
85.9k
    default:
911
85.9k
      if (!f.empty())
912
83.2k
        s = (unsigned char)f[0];
913
95.5k
  }
914
95.5k
  if (s == 0)
915
12.2k
    HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
916
95.5k
  return s;
917
95.5k
}
918
919
1.13k
std::string HashMgr::encode_flag(unsigned short f) const {
920
1.13k
  if (f == 0)
921
245
    return "(NULL)";
922
889
  std::string ch;
923
889
  if (flag_mode == FLAG_LONG) {
924
32
    ch.push_back((unsigned char)(f >> 8));
925
32
    ch.push_back((unsigned char)(f - ((f >> 8) << 8)));
926
857
  } else if (flag_mode == FLAG_NUM) {
927
11
    ch = std::to_string(f); 
928
846
  } else if (flag_mode == FLAG_UNI) {
929
930
#if defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
931
932
#if __cplusplus >= 202002L || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
933
    auto wc = std::bit_cast<w_char>(f);
934
#else
935
    w_char wc;
936
    memcpy(&wc, &f, sizeof(unsigned short));
937
#endif
938
939
#else
940
27
    w_char wc;
941
27
    wc.h = (unsigned char)(f >> 8);
942
27
    wc.l = (unsigned char)(f & 0xff);
943
27
#endif
944
27
    const std::vector<w_char> w = { wc };
945
27
    u16_u8(ch, w);
946
819
  } else {
947
819
    ch.push_back((unsigned char)(f));
948
819
  }
949
889
  return ch;
950
1.13k
}
951
952
// read in aff file and set flag mode
953
15.2k
int HashMgr::load_config(const char* affpath, const char* key) {
954
15.2k
  int firstline = 1;
955
956
  // open the affix file
957
15.2k
  FileMgr* afflst = new FileMgr(affpath, key);
958
15.2k
  if (!afflst) {
959
0
    HUNSPELL_WARNING(
960
0
        stderr, "Error - could not open affix description file %s\n", affpath);
961
0
    return 1;
962
0
  }
963
964
  // read in each line ignoring any that do not
965
  // start with a known line type indicator
966
967
15.2k
  std::string line;
968
639k
  while (afflst->getline(line)) {
969
626k
    mychomp(line);
970
971
    /* remove byte order mark */
972
626k
    if (firstline) {
973
14.8k
      firstline = 0;
974
14.8k
      if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
975
1
        line.erase(0, 3);
976
1
      }
977
14.8k
    }
978
979
    /* parse in the try string */
980
626k
    if ((line.compare(0, 4, "FLAG", 4) == 0) && line.size() > 4 && isspace(line[4])) {
981
5.46k
      if (flag_mode != FLAG_CHAR) {
982
2.51k
        HUNSPELL_WARNING(stderr,
983
2.51k
                         "error: line %d: multiple definitions of the FLAG "
984
2.51k
                         "affix file parameter\n",
985
2.51k
                         afflst->getlinenum());
986
2.51k
      }
987
5.46k
      if (line.find("long") != std::string::npos)
988
1.02k
        flag_mode = FLAG_LONG;
989
5.46k
      if (line.find("num") != std::string::npos)
990
1.32k
        flag_mode = FLAG_NUM;
991
5.46k
      if (line.find("UTF-8") != std::string::npos)
992
804
        flag_mode = FLAG_UNI;
993
5.46k
      if (flag_mode == FLAG_CHAR) {
994
1.80k
        HUNSPELL_WARNING(
995
1.80k
            stderr,
996
1.80k
            "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n",
997
1.80k
            afflst->getlinenum());
998
1.80k
      }
999
5.46k
    }
1000
1001
626k
    if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) {
1002
1.47k
      std::string st;
1003
1.47k
      if (!parse_string(line, st, afflst->getlinenum())) {
1004
5
        delete afflst;
1005
5
        return 1;
1006
5
      }
1007
1.47k
      forbiddenword = decode_flag(st);
1008
1.47k
    }
1009
1010
626k
    if (line.compare(0, 3, "SET", 3) == 0) {
1011
4.23k
      if (!parse_string(line, enc, afflst->getlinenum())) {
1012
141
        delete afflst;
1013
141
        return 1;
1014
141
      }
1015
4.09k
      if (enc == "UTF-8") {
1016
3.45k
        utf8 = 1;
1017
3.45k
      } else
1018
642
        csconv = get_current_cs(enc);
1019
4.09k
    }
1020
1021
625k
    if (line.compare(0, 4, "LANG", 4) == 0) {
1022
1.38k
      if (!parse_string(line, lang, afflst->getlinenum())) {
1023
35
        delete afflst;
1024
35
        return 1;
1025
35
      }
1026
1.34k
      langnum = get_lang_num(lang);
1027
1.34k
    }
1028
1029
    /* parse in the ignored characters (for example, Arabic optional diacritics
1030
     * characters */
1031
625k
    if (line.compare(0, 6, "IGNORE", 6) == 0) {
1032
1.31k
      if (!parse_array(line, ignorechars, ignorechars_utf16,
1033
1.31k
                       utf8, afflst->getlinenum())) {
1034
53
        delete afflst;
1035
53
        return 1;
1036
53
      }
1037
1.31k
    }
1038
1039
625k
    if ((line.compare(0, 2, "AF", 2) == 0) && line.size() > 2 && isspace(line[2])) {
1040
837
      if (!parse_aliasf(line, afflst)) {
1041
650
        delete afflst;
1042
650
        return 1;
1043
650
      }
1044
837
    }
1045
1046
625k
    if ((line.compare(0, 2, "AM", 2) == 0) && line.size() > 2 && isspace(line[2])) {
1047
687
      if (!parse_aliasm(line, afflst)) {
1048
377
        delete afflst;
1049
377
        return 1;
1050
377
      }
1051
687
    }
1052
1053
624k
    if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
1054
4.60k
      complexprefixes = 1;
1055
1056
    /* parse in the typical fault correcting table */
1057
624k
    if (line.compare(0, 3, "REP", 3) == 0) {
1058
1.10k
      if (!parse_reptable(line, afflst)) {
1059
1.08k
        delete afflst;
1060
1.08k
        return 1;
1061
1.08k
      }
1062
1.10k
    }
1063
1064
    // don't check the full affix file, yet
1065
623k
    if (((line.compare(0, 3, "SFX", 3) == 0) ||
1066
602k
         (line.compare(0, 3, "PFX", 3) == 0)) &&
1067
33.6k
            line.size() > 3 && isspace(line[3]) &&
1068
15.4k
            !reptable.empty()) // (REP table is in the end of Afrikaans aff file)
1069
2
      break;
1070
623k
  }
1071
1072
12.8k
  delete afflst;
1073
12.8k
  return 0;
1074
15.2k
}
1075
1076
/* parse in the ALIAS table */
1077
837
bool HashMgr::parse_aliasf(const std::string& line, FileMgr* af) {
1078
837
  if (!aliasf.empty()) {
1079
62
    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
1080
62
                     af->getlinenum());
1081
62
    return false;
1082
62
  }
1083
775
  int i = 0, np = 0, numaliasf = 0;
1084
775
  auto iter = line.begin(), start_piece = mystrsep(line, iter);
1085
2.85k
  while (start_piece != line.end()) {
1086
2.19k
    switch (i) {
1087
775
      case 0: {
1088
775
        np++;
1089
775
        break;
1090
0
      }
1091
766
      case 1: {
1092
766
        numaliasf = atoi(std::string(start_piece, iter).c_str());
1093
766
        if (numaliasf < 1) {
1094
116
          aliasf.clear();
1095
116
          aliasflen.clear();
1096
116
          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
1097
116
                           af->getlinenum());
1098
116
          return false;
1099
116
        }
1100
650
        aliasf.reserve(std::min(numaliasf, 16384));
1101
650
        aliasflen.reserve(std::min(numaliasf, 16384));
1102
650
        np++;
1103
650
        break;
1104
766
      }
1105
651
      default:
1106
651
        break;
1107
2.19k
    }
1108
2.07k
    ++i;
1109
2.07k
    start_piece = mystrsep(line, iter);
1110
2.07k
  }
1111
659
  if (np != 2) {
1112
9
    aliasf.clear();
1113
9
    aliasflen.clear();
1114
9
    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
1115
9
                     af->getlinenum());
1116
9
    return false;
1117
9
  }
1118
1119
  /* now parse the numaliasf lines to read in the remainder of the table */
1120
5.55k
  for (int j = 0; j < numaliasf; ++j) {
1121
5.36k
    std::string nl;
1122
5.36k
    unsigned short* alias = NULL;
1123
5.36k
    unsigned aliaslen = 0;
1124
5.36k
    i = 0;
1125
5.36k
    if (af->getline(nl)) {
1126
5.16k
      mychomp(nl);
1127
5.16k
      iter = nl.begin();
1128
5.16k
      start_piece = mystrsep(nl, iter);
1129
5.16k
      bool errored = false;
1130
22.3k
      while (!errored && start_piece != nl.end()) {
1131
17.2k
        switch (i) {
1132
5.16k
          case 0: {
1133
5.16k
            if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) {
1134
239
              errored = true;
1135
239
              break;
1136
239
            }
1137
4.92k
            break;
1138
5.16k
          }
1139
4.92k
          case 1: {
1140
4.90k
            std::string piece(start_piece, iter);
1141
4.90k
            aliaslen =
1142
4.90k
                (unsigned short)decode_flags(&alias, piece, af);
1143
4.90k
            std::sort(alias, alias + aliaslen);
1144
4.90k
            break;
1145
5.16k
          }
1146
7.14k
          default:
1147
7.14k
            break;
1148
17.2k
        }
1149
17.2k
        ++i;
1150
17.2k
        start_piece = mystrsep(nl, iter);
1151
17.2k
      }
1152
5.16k
    }
1153
5.36k
    if (!alias) {
1154
4.86k
      for (int k = 0; k < j; ++k) {
1155
4.39k
        delete[] aliasf[k];
1156
4.39k
      }
1157
463
      aliasf.clear();
1158
463
      aliasflen.clear();
1159
463
      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1160
463
                       af->getlinenum());
1161
463
      return false;
1162
463
    }
1163
1164
4.90k
    aliasf.push_back(alias);
1165
4.90k
    aliasflen.push_back(aliaslen);
1166
4.90k
  }
1167
187
  return true;
1168
650
}
1169
1170
43.1k
int HashMgr::is_aliasf() const {
1171
43.1k
  return !aliasf.empty();
1172
43.1k
}
1173
1174
5.67k
int HashMgr::get_aliasf(int index, unsigned short** fvec, FileMgr* af) const {
1175
5.67k
  if (index > 0 && static_cast<size_t>(index) <= aliasflen.size()) {
1176
1.79k
    *fvec = aliasf[index - 1];
1177
1.79k
    return aliasflen[index - 1];
1178
1.79k
  }
1179
3.88k
  HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n",
1180
3.88k
                   af->getlinenum(), index);
1181
3.88k
  *fvec = NULL;
1182
3.88k
  return 0;
1183
5.67k
}
1184
1185
/* parse morph alias definitions */
1186
687
bool HashMgr::parse_aliasm(const std::string& line, FileMgr* af) {
1187
687
  if (!aliasm.empty()) {
1188
64
    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
1189
64
                     af->getlinenum());
1190
64
    return false;
1191
64
  }
1192
623
  int i = 0, np = 0, numaliasm = 0;
1193
623
  auto iter = line.begin(), start_piece = mystrsep(line, iter);
1194
2.62k
  while (start_piece != line.end()) {
1195
2.04k
    switch (i) {
1196
623
      case 0: {
1197
623
        np++;
1198
623
        break;
1199
0
      }
1200
604
      case 1: {
1201
604
        numaliasm = atoi(std::string(start_piece, iter).c_str());
1202
604
        if (numaliasm < 1) {
1203
42
          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
1204
42
                           af->getlinenum());
1205
42
          return false;
1206
42
        }
1207
562
        aliasm.reserve(std::min(numaliasm, 16384));
1208
562
        np++;
1209
562
        break;
1210
604
      }
1211
816
      default:
1212
816
        break;
1213
2.04k
    }
1214
2.00k
    ++i;
1215
2.00k
    start_piece = mystrsep(line, iter);
1216
2.00k
  }
1217
581
  if (np != 2) {
1218
19
    aliasm.clear();
1219
19
    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
1220
19
                     af->getlinenum());
1221
19
    return false;
1222
19
  }
1223
1224
  /* now parse the numaliasm lines to read in the remainder of the table */
1225
2.46k
  for (int j = 0; j < numaliasm; ++j) {
1226
2.15k
    std::string nl;
1227
2.15k
    char* alias = NULL;
1228
2.15k
    if (af->getline(nl)) {
1229
2.07k
      mychomp(nl);
1230
2.07k
      iter = nl.begin();
1231
2.07k
      i = 0;
1232
2.07k
      start_piece = mystrsep(nl, iter);
1233
2.07k
      bool errored = false;
1234
9.00k
      while (!errored && start_piece != nl.end()) {
1235
6.92k
        switch (i) {
1236
2.06k
          case 0: {
1237
2.06k
            if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) {
1238
144
              errored = true;
1239
144
              break;
1240
144
            }
1241
1.92k
            break;
1242
2.06k
          }
1243
1.92k
          case 1: {
1244
            // add the remaining of the line
1245
1.90k
            std::string::const_iterator end = nl.end();
1246
1.90k
            std::string chunk(start_piece, end);
1247
1.90k
            if (complexprefixes) {
1248
763
              if (utf8)
1249
264
                reverseword_utf(chunk);
1250
499
              else
1251
499
                reverseword(chunk);
1252
763
            }
1253
1.90k
            size_t sl = chunk.size() + 1;
1254
1.90k
            alias = new char[sl];
1255
1.90k
            memcpy(alias, chunk.c_str(), sl);
1256
1.90k
            break;
1257
2.06k
          }
1258
2.95k
          default:
1259
2.95k
            break;
1260
6.92k
        }
1261
6.92k
        ++i;
1262
6.92k
        start_piece = mystrsep(nl, iter);
1263
6.92k
      }
1264
2.07k
    }
1265
2.15k
    if (!alias) {
1266
1.42k
      for (int k = 0; k < j; ++k) {
1267
1.17k
        delete[] aliasm[k];
1268
1.17k
      }
1269
252
      aliasm.clear();
1270
252
      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1271
252
                       af->getlinenum());
1272
252
      return false;
1273
252
    }
1274
1.90k
    aliasm.push_back(alias);
1275
1.90k
  }
1276
310
  return true;
1277
562
}
1278
1279
44.1k
int HashMgr::is_aliasm() const {
1280
44.1k
  return !aliasm.empty();
1281
44.1k
}
1282
1283
21.1k
char* HashMgr::get_aliasm(int index) const {
1284
21.1k
  if (index > 0 && static_cast<size_t>(index) <= aliasm.size())
1285
8.27k
    return aliasm[index - 1];
1286
12.8k
  HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
1287
12.8k
  return NULL;
1288
21.1k
}
1289
1290
/* parse in the typical fault correcting table */
1291
1.10k
bool HashMgr::parse_reptable(const std::string& line, FileMgr* af) {
1292
1.10k
  if (!reptable.empty()) {
1293
6
    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
1294
6
                     af->getlinenum());
1295
6
    return false;
1296
6
  }
1297
1.09k
  int numrep = -1, i = 0, np = 0;
1298
1.09k
  auto iter = line.begin(), start_piece = mystrsep(line, iter);
1299
5.96k
  while (start_piece != line.end()) {
1300
4.95k
    switch (i) {
1301
1.09k
      case 0: {
1302
1.09k
        np++;
1303
1.09k
        break;
1304
0
      }
1305
930
      case 1: {
1306
930
        numrep = atoi(std::string(start_piece, iter).c_str());
1307
930
        if (numrep < 1) {
1308
91
          HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
1309
91
                           af->getlinenum());
1310
91
          return false;
1311
91
        }
1312
839
        reptable.reserve(std::min(numrep, 16384));
1313
839
        np++;
1314
839
        break;
1315
930
      }
1316
2.92k
      default:
1317
2.92k
        break;
1318
4.95k
    }
1319
4.86k
    ++i;
1320
4.86k
    start_piece = mystrsep(line, iter);
1321
4.86k
  }
1322
1.00k
  if (np != 2) {
1323
169
    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
1324
169
                     af->getlinenum());
1325
169
    return false;
1326
169
  }
1327
1328
  /* now parse the numrep lines to read in the remainder of the table */
1329
4.36k
  for (int j = 0; j < numrep; ++j) {
1330
4.34k
    std::string nl;
1331
4.34k
    reptable.emplace_back();
1332
4.34k
    int type = 0;
1333
4.34k
    if (af->getline(nl)) {
1334
4.16k
      mychomp(nl);
1335
4.16k
      iter = nl.begin();
1336
4.16k
      i = 0;
1337
4.16k
      start_piece = mystrsep(nl, iter);
1338
4.16k
      bool errored = false;
1339
18.0k
      while (!errored && start_piece != nl.end()) {
1340
13.8k
        switch (i) {
1341
4.08k
          case 0: {
1342
4.08k
            if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) {
1343
481
              errored = true;
1344
481
              break;
1345
481
            }
1346
3.59k
            break;
1347
4.08k
          }
1348
3.59k
          case 1: {
1349
3.57k
            if (*start_piece == '^')
1350
80
              type = 1;
1351
3.57k
            reptable.back().pattern.assign(start_piece + type, iter);
1352
3.57k
            mystrrep(reptable.back().pattern, "_", " ");
1353
3.57k
            if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') {
1354
2.64k
              type += 2;
1355
2.64k
              reptable.back().pattern.resize(reptable.back().pattern.size() - 1);
1356
2.64k
            }
1357
3.57k
            break;
1358
4.08k
          }
1359
3.53k
          case 2: {
1360
3.53k
            reptable.back().outstrings[type].assign(start_piece, iter);
1361
3.53k
            mystrrep(reptable.back().outstrings[type], "_", " ");
1362
3.53k
            break;
1363
4.08k
          }
1364
2.66k
          default:
1365
2.66k
            break;
1366
13.8k
        }
1367
13.8k
        ++i;
1368
13.8k
        start_piece = mystrsep(nl, iter);
1369
13.8k
      }
1370
4.16k
    }
1371
4.34k
    if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) {
1372
815
      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1373
815
                       af->getlinenum());
1374
815
      reptable.clear();
1375
815
      return false;
1376
815
    }
1377
4.34k
  }
1378
24
  return true;
1379
839
}
1380
1381
// return replacing table
1382
3.02M
const std::vector<replentry>& HashMgr::get_reptable() const {
1383
3.02M
  return reptable;
1384
3.02M
}