Coverage Report

Created: 2025-05-08 07:17

/src/hunspell/src/hunspell/affixmgr.cxx
Line
Count
Source (jump to first uncovered line)
1
/* ***** BEGIN LICENSE BLOCK *****
2
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3
 *
4
 * Copyright (C) 2002-2022 Németh László
5
 *
6
 * The contents of this file are subject to the Mozilla Public License Version
7
 * 1.1 (the "License"); you may not use this file except in compliance with
8
 * the License. You may obtain a copy of the License at
9
 * http://www.mozilla.org/MPL/
10
 *
11
 * Software distributed under the License is distributed on an "AS IS" basis,
12
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13
 * for the specific language governing rights and limitations under the
14
 * License.
15
 *
16
 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17
 *
18
 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19
 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20
 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21
 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22
 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23
 *
24
 * Alternatively, the contents of this file may be used under the terms of
25
 * either the GNU General Public License Version 2 or later (the "GPL"), or
26
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27
 * in which case the provisions of the GPL or the LGPL are applicable instead
28
 * of those above. If you wish to allow use of your version of this file only
29
 * under the terms of either the GPL or the LGPL, and not to allow others to
30
 * use your version of this file under the terms of the MPL, indicate your
31
 * decision by deleting the provisions above and replace them with the notice
32
 * and other provisions required by the GPL or the LGPL. If you do not delete
33
 * the provisions above, a recipient may use your version of this file under
34
 * the terms of any one of the MPL, the GPL or the LGPL.
35
 *
36
 * ***** END LICENSE BLOCK ***** */
37
/*
38
 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39
 * And Contributors.  All rights reserved.
40
 *
41
 * Redistribution and use in source and binary forms, with or without
42
 * modification, are permitted provided that the following conditions
43
 * are met:
44
 *
45
 * 1. Redistributions of source code must retain the above copyright
46
 *    notice, this list of conditions and the following disclaimer.
47
 *
48
 * 2. Redistributions in binary form must reproduce the above copyright
49
 *    notice, this list of conditions and the following disclaimer in the
50
 *    documentation and/or other materials provided with the distribution.
51
 *
52
 * 3. All modifications to the source code must be clearly marked as
53
 *    such.  Binary redistributions based on modified source code
54
 *    must be clearly marked as modified versions in the documentation
55
 *    and/or other materials provided with the distribution.
56
 *
57
 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
61
 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68
 * SUCH DAMAGE.
69
 */
70
71
#include <cstdlib>
72
#include <cstring>
73
#include <cstdio>
74
#include <cctype>
75
#include <ctime>
76
77
#include <algorithm>
78
#include <chrono>
79
#include <memory>
80
#include <limits>
81
#include <string>
82
#include <vector>
83
84
#include "affixmgr.hxx"
85
#include "affentry.hxx"
86
#include "langnum.hxx"
87
88
#include "csutil.hxx"
89
90
AffixMgr::AffixMgr(const char* affpath,
91
                   const std::vector<HashMgr*>& ptr,
92
                   const char* key)
93
18.3k
  : alldic(ptr)
94
18.3k
  , pHMgr(ptr[0]) {
95
96
  // register hash manager and load affix data from aff file
97
18.3k
  csconv = NULL;
98
18.3k
  utf8 = 0;
99
18.3k
  complexprefixes = 0;
100
18.3k
  parsedmaptable = false;
101
18.3k
  parsedbreaktable = false;
102
18.3k
  iconvtable = NULL;
103
18.3k
  oconvtable = NULL;
104
  // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
105
18.3k
  simplifiedcpd = 0;
106
18.3k
  parsedcheckcpd = false;
107
18.3k
  parseddefcpd = false;
108
18.3k
  phone = NULL;
109
18.3k
  compoundflag = FLAG_NULL;        // permits word in compound forms
110
18.3k
  compoundbegin = FLAG_NULL;       // may be first word in compound forms
111
18.3k
  compoundmiddle = FLAG_NULL;      // may be middle word in compound forms
112
18.3k
  compoundend = FLAG_NULL;         // may be last word in compound forms
113
18.3k
  compoundroot = FLAG_NULL;        // compound word signing flag
114
18.3k
  compoundpermitflag = FLAG_NULL;  // compound permitting flag for suffixed word
115
18.3k
  compoundforbidflag = FLAG_NULL;  // compound fordidden flag for suffixed word
116
18.3k
  compoundmoresuffixes = 0;        // allow more suffixes within compound words
117
18.3k
  checkcompounddup = 0;            // forbid double words in compounds
118
18.3k
  checkcompoundrep = 0;  // forbid bad compounds (may be non-compound word with
119
                         // a REP substitution)
120
18.3k
  checkcompoundcase =
121
18.3k
      0;  // forbid upper and lowercase combinations at word bounds
122
18.3k
  checkcompoundtriple = 0;  // forbid compounds with triple letters
123
18.3k
  simplifiedtriple = 0;     // allow simplified triple letters in compounds
124
                            // (Schiff+fahrt -> Schiffahrt)
125
18.3k
  forbiddenword = FORBIDDENWORD;  // forbidden word signing flag
126
18.3k
  nosuggest = FLAG_NULL;  // don't suggest words signed with NOSUGGEST flag
127
18.3k
  nongramsuggest = FLAG_NULL;
128
18.3k
  langnum = 0;  // language code (see http://l10n.openoffice.org/languages.html)
129
18.3k
  needaffix = FLAG_NULL;  // forbidden root, allowed only with suffixes
130
18.3k
  cpdwordmax = -1;        // default: unlimited wordcount in compound words
131
18.3k
  cpdmin = -1;            // undefined
132
18.3k
  cpdmaxsyllable = 0;     // default: unlimited syllablecount in compound words
133
18.3k
  pfxappnd = NULL;  // previous prefix for counting syllables of the prefix BUG
134
18.3k
  sfxappnd = NULL;  // previous suffix for counting syllables of the suffix BUG
135
18.3k
  sfxextra = 0;     // modifier for syllable count of sfxappnd BUG
136
18.3k
  checknum = 0;               // checking numbers, and word with numbers
137
18.3k
  havecontclass = 0;  // flags of possible continuing classes (double affix)
138
  // LEMMA_PRESENT: do not put the root into the morphological output. The lemma
139
  // is present in the morphological description in the dictionary file. It's
140
  // often combined with PSEUDOROOT.
141
18.3k
  lemma_present = FLAG_NULL;
142
18.3k
  circumfix = FLAG_NULL;
143
18.3k
  onlyincompound = FLAG_NULL;
144
18.3k
  maxngramsugs = -1;  // undefined
145
18.3k
  maxdiff = -1;       // undefined
146
18.3k
  onlymaxdiff = 0;
147
18.3k
  maxcpdsugs = -1;  // undefined
148
18.3k
  nosplitsugs = 0;
149
18.3k
  sugswithdots = 0;
150
18.3k
  keepcase = 0;
151
18.3k
  forceucase = 0;
152
18.3k
  warn = 0;
153
18.3k
  forbidwarn = 0;
154
18.3k
  checksharps = 0;
155
18.3k
  substandard = FLAG_NULL;
156
18.3k
  fullstrip = 0;
157
158
18.3k
  sfx = NULL;
159
18.3k
  pfx = NULL;
160
161
4.71M
  for (int i = 0; i < SETSIZE; i++) {
162
4.69M
    pStart[i] = NULL;
163
4.69M
    sStart[i] = NULL;
164
4.69M
    pFlag[i] = NULL;
165
4.69M
    sFlag[i] = NULL;
166
4.69M
  }
167
168
18.3k
  memset(contclasses, 0, CONTSIZE * sizeof(char));
169
170
18.3k
  if (parse_file(affpath, key)) {
171
6.30k
    HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n", affpath);
172
6.30k
  }
173
174
  /* get encoding for CHECKCOMPOUNDCASE */
175
18.3k
  if (!utf8) {
176
13.9k
    csconv = get_current_cs(get_encoding());
177
3.58M
    for (int i = 0; i <= 255; i++) {
178
3.57M
      if ((csconv[i].cupper != csconv[i].clower) &&
179
3.57M
          (wordchars.find((char)i) == std::string::npos)) {
180
1.56M
        wordchars.push_back((char)i);
181
1.56M
      }
182
3.57M
    }
183
13.9k
  }
184
185
  // default BREAK definition
186
18.3k
  if (!parsedbreaktable) {
187
16.4k
    breaktable.emplace_back("-");
188
16.4k
    breaktable.emplace_back("^-");
189
16.4k
    breaktable.emplace_back("-$");
190
16.4k
    parsedbreaktable = true;
191
16.4k
  }
192
193
18.3k
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
194
  // not entirely sure this is invalid, so only for fuzzing for now
195
18.3k
  if (iconvtable && !iconvtable->check_against_breaktable(breaktable)) {
196
124
      delete iconvtable;
197
124
      iconvtable = nullptr;
198
124
  }
199
18.3k
#endif
200
201
18.3k
  if (cpdmin == -1)
202
17.7k
    cpdmin = MINCPDLEN;
203
18.3k
}
204
205
18.3k
AffixMgr::~AffixMgr() {
206
  // pass through linked prefix entries and clean up
207
4.71M
  for (int i = 0; i < SETSIZE; i++) {
208
4.69M
    pFlag[i] = NULL;
209
4.69M
    PfxEntry* ptr = pStart[i];
210
4.69M
    PfxEntry* nptr = NULL;
211
4.71M
    while (ptr) {
212
18.5k
      nptr = ptr->getNext();
213
18.5k
      delete (ptr);
214
18.5k
      ptr = nptr;
215
18.5k
      nptr = NULL;
216
18.5k
    }
217
4.69M
  }
218
219
  // pass through linked suffix entries and clean up
220
4.71M
  for (int j = 0; j < SETSIZE; j++) {
221
4.69M
    sFlag[j] = NULL;
222
4.69M
    SfxEntry* ptr = sStart[j];
223
4.69M
    SfxEntry* nptr = NULL;
224
4.72M
    while (ptr) {
225
27.1k
      nptr = ptr->getNext();
226
27.1k
      delete (ptr);
227
27.1k
      ptr = nptr;
228
27.1k
      nptr = NULL;
229
27.1k
    }
230
4.69M
    sStart[j] = NULL;
231
4.69M
  }
232
233
18.3k
  delete iconvtable;
234
18.3k
  delete oconvtable;
235
18.3k
  delete phone;
236
237
18.3k
  FREE_FLAG(compoundflag);
238
18.3k
  FREE_FLAG(compoundbegin);
239
18.3k
  FREE_FLAG(compoundmiddle);
240
18.3k
  FREE_FLAG(compoundend);
241
18.3k
  FREE_FLAG(compoundpermitflag);
242
18.3k
  FREE_FLAG(compoundforbidflag);
243
18.3k
  FREE_FLAG(compoundroot);
244
18.3k
  FREE_FLAG(forbiddenword);
245
18.3k
  FREE_FLAG(nosuggest);
246
18.3k
  FREE_FLAG(nongramsuggest);
247
18.3k
  FREE_FLAG(needaffix);
248
18.3k
  FREE_FLAG(lemma_present);
249
18.3k
  FREE_FLAG(circumfix);
250
18.3k
  FREE_FLAG(onlyincompound);
251
252
18.3k
  cpdwordmax = 0;
253
18.3k
  pHMgr = NULL;
254
18.3k
  cpdmin = 0;
255
18.3k
  cpdmaxsyllable = 0;
256
18.3k
  checknum = 0;
257
#ifdef MOZILLA_CLIENT
258
  delete[] csconv;
259
#endif
260
18.3k
}
261
262
18.3k
void AffixMgr::finishFileMgr(FileMgr* afflst) {
263
18.3k
  delete afflst;
264
265
  // convert affix trees to sorted list
266
18.3k
  process_pfx_tree_to_list();
267
18.3k
  process_sfx_tree_to_list();
268
18.3k
}
269
270
// read in aff file and build up prefix and suffix entry objects
271
18.3k
int AffixMgr::parse_file(const char* affpath, const char* key) {
272
273
  // checking flag duplication
274
18.3k
  char dupflags[CONTSIZE];
275
18.3k
  char dupflags_ini = 1;
276
277
  // first line indicator for removing byte order mark
278
18.3k
  int firstline = 1;
279
280
  // open the affix file
281
18.3k
  FileMgr* afflst = new FileMgr(affpath, key);
282
18.3k
  if (!afflst) {
283
0
    HUNSPELL_WARNING(
284
0
        stderr, "error: could not open affix description file %s\n", affpath);
285
0
    return 1;
286
0
  }
287
288
  // step one is to parse the affix file building up the internal
289
  // affix data structures
290
291
  // read in each line ignoring any that do not
292
  // start with a known line type indicator
293
18.3k
  std::string line;
294
587k
  while (afflst->getline(line)) {
295
575k
    mychomp(line);
296
297
    /* remove byte order mark */
298
575k
    if (firstline) {
299
18.0k
      firstline = 0;
300
      // Affix file begins with byte order mark: possible incompatibility with
301
      // old Hunspell versions
302
18.0k
      if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
303
1
        line.erase(0, 3);
304
1
      }
305
18.0k
    }
306
307
    /* parse in the keyboard string */
308
575k
    if (line.compare(0, 3, "KEY", 3) == 0) {
309
197
      if (!parse_string(line, keystring, afflst->getlinenum())) {
310
6
        finishFileMgr(afflst);
311
6
        return 1;
312
6
      }
313
197
    }
314
315
    /* parse in the try string */
316
575k
    if (line.compare(0, 3, "TRY", 3) == 0) {
317
1.05k
      if (!parse_string(line, trystring, afflst->getlinenum())) {
318
26
        finishFileMgr(afflst);
319
26
        return 1;
320
26
      }
321
1.05k
    }
322
323
    /* parse in the name of the character set used by the .dict and .aff */
324
575k
    if (line.compare(0, 3, "SET", 3) == 0) {
325
5.46k
      if (!parse_string(line, encoding, afflst->getlinenum())) {
326
153
        finishFileMgr(afflst);
327
153
        return 1;
328
153
      }
329
5.30k
      if (encoding == "UTF-8") {
330
4.40k
        utf8 = 1;
331
4.40k
      }
332
5.30k
    }
333
334
    /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left
335
     * writing system */
336
575k
    if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
337
5.04k
      complexprefixes = 1;
338
339
    /* parse in the flag used by the controlled compound words */
340
575k
    if (line.compare(0, 12, "COMPOUNDFLAG", 12) == 0) {
341
3.30k
      if (!parse_flag(line, &compoundflag, afflst)) {
342
113
        finishFileMgr(afflst);
343
113
        return 1;
344
113
      }
345
3.30k
    }
346
347
    /* parse in the flag used by compound words */
348
575k
    if (line.compare(0, 13, "COMPOUNDBEGIN", 13) == 0) {
349
1.22k
      if (complexprefixes) {
350
518
        if (!parse_flag(line, &compoundend, afflst)) {
351
8
          finishFileMgr(afflst);
352
8
          return 1;
353
8
        }
354
710
      } else {
355
710
        if (!parse_flag(line, &compoundbegin, afflst)) {
356
3
          finishFileMgr(afflst);
357
3
          return 1;
358
3
        }
359
710
      }
360
1.22k
    }
361
362
    /* parse in the flag used by compound words */
363
575k
    if (line.compare(0, 14, "COMPOUNDMIDDLE", 14) == 0) {
364
317
      if (!parse_flag(line, &compoundmiddle, afflst)) {
365
11
        finishFileMgr(afflst);
366
11
        return 1;
367
11
      }
368
317
    }
369
370
    /* parse in the flag used by compound words */
371
575k
    if (line.compare(0, 11, "COMPOUNDEND", 11) == 0) {
372
950
      if (complexprefixes) {
373
475
        if (!parse_flag(line, &compoundbegin, afflst)) {
374
18
          finishFileMgr(afflst);
375
18
          return 1;
376
18
        }
377
475
      } else {
378
475
        if (!parse_flag(line, &compoundend, afflst)) {
379
10
          finishFileMgr(afflst);
380
10
          return 1;
381
10
        }
382
475
      }
383
950
    }
384
385
    /* parse in the data used by compound_check() method */
386
575k
    if (line.compare(0, 15, "COMPOUNDWORDMAX", 15) == 0) {
387
280
      if (!parse_num(line, &cpdwordmax, afflst)) {
388
9
        finishFileMgr(afflst);
389
9
        return 1;
390
9
      }
391
280
    }
392
393
    /* parse in the flag that marks compound words in the dictionary */
394
574k
    if (line.compare(0, 12, "COMPOUNDROOT", 12) == 0) {
395
407
      if (!parse_flag(line, &compoundroot, afflst)) {
396
5
        finishFileMgr(afflst);
397
5
        return 1;
398
5
      }
399
407
    }
400
401
    /* parse in the flag used by compound_check() method */
402
574k
    if (line.compare(0, 18, "COMPOUNDPERMITFLAG", 18) == 0) {
403
446
      if (!parse_flag(line, &compoundpermitflag, afflst)) {
404
8
        finishFileMgr(afflst);
405
8
        return 1;
406
8
      }
407
446
    }
408
409
    /* parse in the flag used by compound_check() method */
410
574k
    if (line.compare(0, 18, "COMPOUNDFORBIDFLAG", 18) == 0) {
411
376
      if (!parse_flag(line, &compoundforbidflag, afflst)) {
412
5
        finishFileMgr(afflst);
413
5
        return 1;
414
5
      }
415
376
    }
416
417
574k
    if (line.compare(0, 20, "COMPOUNDMORESUFFIXES", 20) == 0) {
418
2.20k
      compoundmoresuffixes = 1;
419
2.20k
    }
420
421
574k
    if (line.compare(0, 16, "CHECKCOMPOUNDDUP", 16) == 0) {
422
510
      checkcompounddup = 1;
423
510
    }
424
425
574k
    if (line.compare(0, 16, "CHECKCOMPOUNDREP", 16) == 0) {
426
419
      checkcompoundrep = 1;
427
419
    }
428
429
574k
    if (line.compare(0, 19, "CHECKCOMPOUNDTRIPLE", 19) == 0) {
430
168
      checkcompoundtriple = 1;
431
168
    }
432
433
574k
    if (line.compare(0, 16, "SIMPLIFIEDTRIPLE", 16) == 0) {
434
196
      simplifiedtriple = 1;
435
196
    }
436
437
574k
    if (line.compare(0, 17, "CHECKCOMPOUNDCASE", 17) == 0) {
438
461
      checkcompoundcase = 1;
439
461
    }
440
441
574k
    if (line.compare(0, 9, "NOSUGGEST", 9) == 0) {
442
260
      if (!parse_flag(line, &nosuggest, afflst)) {
443
13
        finishFileMgr(afflst);
444
13
        return 1;
445
13
      }
446
260
    }
447
448
574k
    if (line.compare(0, 14, "NONGRAMSUGGEST", 14) == 0) {
449
380
      if (!parse_flag(line, &nongramsuggest, afflst)) {
450
10
        finishFileMgr(afflst);
451
10
        return 1;
452
10
      }
453
380
    }
454
455
    /* parse in the flag used by forbidden words */
456
574k
    if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) {
457
2.08k
      if (!parse_flag(line, &forbiddenword, afflst)) {
458
41
        finishFileMgr(afflst);
459
41
        return 1;
460
41
      }
461
2.08k
    }
462
463
    /* parse in the LEMMA_PRESENT flag (deprecated) */
464
574k
    if (line.compare(0, 13, "LEMMA_PRESENT", 13) == 0) {
465
79
      if (!parse_flag(line, &lemma_present, afflst)) {
466
2
        finishFileMgr(afflst);
467
2
        return 1;
468
2
      }
469
79
    }
470
471
    /* parse in the flag used by circumfixes */
472
574k
    if (line.compare(0, 9, "CIRCUMFIX", 9) == 0) {
473
585
      if (!parse_flag(line, &circumfix, afflst)) {
474
13
        finishFileMgr(afflst);
475
13
        return 1;
476
13
      }
477
585
    }
478
479
    /* parse in the flag used by fogemorphemes */
480
574k
    if (line.compare(0, 14, "ONLYINCOMPOUND", 14) == 0) {
481
622
      if (!parse_flag(line, &onlyincompound, afflst)) {
482
25
        finishFileMgr(afflst);
483
25
        return 1;
484
25
      }
485
622
    }
486
487
    /* parse in the NEEDAFFIX flag given under its deprecated name PSEUDOROOT */
488
574k
    if (line.compare(0, 10, "PSEUDOROOT", 10) == 0) {
489
172
      if (!parse_flag(line, &needaffix, afflst)) {
490
13
        finishFileMgr(afflst);
491
13
        return 1;
492
13
      }
493
172
    }
494
495
    /* parse in the flag used by `needaffixs' */
496
574k
    if (line.compare(0, 9, "NEEDAFFIX", 9) == 0) {
497
848
      if (!parse_flag(line, &needaffix, afflst)) {
498
23
        finishFileMgr(afflst);
499
23
        return 1;
500
23
      }
501
848
    }
502
503
    /* parse in the minimal length for words in compounds */
504
574k
    if (line.compare(0, 11, "COMPOUNDMIN", 11) == 0) {
505
673
      if (!parse_num(line, &cpdmin, afflst)) {
506
27
        finishFileMgr(afflst);
507
27
        return 1;
508
27
      }
509
646
      if (cpdmin < 1)
510
556
        cpdmin = 1;
511
646
    }
512
513
    /* parse in the max. words and syllables in compounds */
514
574k
    if (line.compare(0, 16, "COMPOUNDSYLLABLE", 16) == 0) {
515
51.4k
      if (!parse_cpdsyllable(line, afflst)) {
516
7
        finishFileMgr(afflst);
517
7
        return 1;
518
7
      }
519
51.4k
    }
520
521
    /* parse in the SYLLABLENUM string */
522
574k
    if (line.compare(0, 11, "SYLLABLENUM", 11) == 0) {
523
49
      if (!parse_string(line, cpdsyllablenum, afflst->getlinenum())) {
524
20
        finishFileMgr(afflst);
525
20
        return 1;
526
20
      }
527
49
    }
528
529
    /* enable CHECKNUM (checking numbers and words with numbers) */
530
574k
    if (line.compare(0, 8, "CHECKNUM", 8) == 0) {
531
177
      checknum = 1;
532
177
    }
533
534
    /* parse in the extra word characters */
535
574k
    if (line.compare(0, 9, "WORDCHARS", 9) == 0) {
536
280
      if (!parse_array(line, wordchars, wordchars_utf16,
537
280
                       utf8, afflst->getlinenum())) {
538
9
        finishFileMgr(afflst);
539
9
        return 1;
540
9
      }
541
280
    }
542
543
    /* parse in the ignored characters (for example, Arabic optional diacritic
544
     * characters) */
545
574k
    if (line.compare(0, 6, "IGNORE", 6) == 0) {
546
1.43k
      if (!parse_array(line, ignorechars, ignorechars_utf16,
547
1.43k
                       utf8, afflst->getlinenum())) {
548
35
        finishFileMgr(afflst);
549
35
        return 1;
550
35
      }
551
1.43k
    }
552
553
    /* parse in the input conversion table */
554
574k
    if (line.compare(0, 5, "ICONV", 5) == 0) {
555
2.38k
      if (!parse_convtable(line, afflst, &iconvtable, "ICONV")) {
556
881
        finishFileMgr(afflst);
557
881
        return 1;
558
881
      }
559
2.38k
    }
560
561
    /* parse in the output conversion table */
562
573k
    if (line.compare(0, 5, "OCONV", 5) == 0) {
563
168
      if (!parse_convtable(line, afflst, &oconvtable, "OCONV")) {
564
115
        finishFileMgr(afflst);
565
115
        return 1;
566
115
      }
567
168
    }
568
569
    /* parse in the phonetic translation table */
570
573k
    if (line.compare(0, 5, "PHONE", 5) == 0) {
571
1.33k
      if (!parse_phonetable(line, afflst)) {
572
320
        finishFileMgr(afflst);
573
320
        return 1;
574
320
      }
575
1.33k
    }
576
577
    /* parse in the checkcompoundpattern table */
578
573k
    if (line.compare(0, 20, "CHECKCOMPOUNDPATTERN", 20) == 0) {
579
869
      if (!parse_checkcpdtable(line, afflst)) {
580
802
        finishFileMgr(afflst);
581
802
        return 1;
582
802
      }
583
869
    }
584
585
    /* parse in the defcompound table */
586
572k
    if (line.compare(0, 12, "COMPOUNDRULE", 12) == 0) {
587
933
      if (!parse_defcpdtable(line, afflst)) {
588
863
        finishFileMgr(afflst);
589
863
        return 1;
590
863
      }
591
933
    }
592
593
    /* parse in the related character map table */
594
571k
    if (line.compare(0, 3, "MAP", 3) == 0) {
595
731
      if (!parse_maptable(line, afflst)) {
596
418
        finishFileMgr(afflst);
597
418
        return 1;
598
418
      }
599
731
    }
600
601
    /* parse in the word breakpoints table */
602
571k
    if (line.compare(0, 5, "BREAK", 5) == 0) {
603
1.94k
      if (!parse_breaktable(line, afflst)) {
604
411
        finishFileMgr(afflst);
605
411
        return 1;
606
411
      }
607
1.94k
    }
608
609
    /* parse in the language for language specific codes */
610
570k
    if (line.compare(0, 4, "LANG", 4) == 0) {
611
1.62k
      if (!parse_string(line, lang, afflst->getlinenum())) {
612
37
        finishFileMgr(afflst);
613
37
        return 1;
614
37
      }
615
1.59k
      langnum = get_lang_num(lang);
616
1.59k
    }
617
618
570k
    if (line.compare(0, 7, "VERSION", 7) == 0) {
619
3.75k
      size_t startpos = line.find_first_not_of(" \t", 7);
620
3.75k
      if (startpos != std::string::npos) {
621
3.10k
          version = line.substr(startpos);
622
3.10k
      }
623
3.75k
    }
624
625
570k
    if (line.compare(0, 12, "MAXNGRAMSUGS", 12) == 0) {
626
400
      if (!parse_num(line, &maxngramsugs, afflst)) {
627
17
        finishFileMgr(afflst);
628
17
        return 1;
629
17
      }
630
400
    }
631
632
570k
    if (line.compare(0, 11, "ONLYMAXDIFF", 11) == 0)
633
115
      onlymaxdiff = 1;
634
635
570k
    if (line.compare(0, 7, "MAXDIFF", 7) == 0) {
636
783
      if (!parse_num(line, &maxdiff, afflst)) {
637
11
        finishFileMgr(afflst);
638
11
        return 1;
639
11
      }
640
783
    }
641
642
570k
    if (line.compare(0, 10, "MAXCPDSUGS", 10) == 0) {
643
179
      if (!parse_num(line, &maxcpdsugs, afflst)) {
644
13
        finishFileMgr(afflst);
645
13
        return 1;
646
13
      }
647
179
    }
648
649
570k
    if (line.compare(0, 11, "NOSPLITSUGS", 11) == 0) {
650
2.90k
      nosplitsugs = 1;
651
2.90k
    }
652
653
570k
    if (line.compare(0, 9, "FULLSTRIP", 9) == 0) {
654
378
      fullstrip = 1;
655
378
    }
656
657
570k
    if (line.compare(0, 12, "SUGSWITHDOTS", 12) == 0) {
658
344
      sugswithdots = 1;
659
344
    }
660
661
    /* parse in the KEEPCASE flag */
662
570k
    if (line.compare(0, 8, "KEEPCASE", 8) == 0) {
663
258
      if (!parse_flag(line, &keepcase, afflst)) {
664
5
        finishFileMgr(afflst);
665
5
        return 1;
666
5
      }
667
258
    }
668
669
    /* parse in the flag used by `forceucase' */
670
570k
    if (line.compare(0, 10, "FORCEUCASE", 10) == 0) {
671
397
      if (!parse_flag(line, &forceucase, afflst)) {
672
16
        finishFileMgr(afflst);
673
16
        return 1;
674
16
      }
675
397
    }
676
677
    /* parse in the flag used by `warn' */
678
570k
    if (line.compare(0, 4, "WARN", 4) == 0) {
679
291
      if (!parse_flag(line, &warn, afflst)) {
680
36
        finishFileMgr(afflst);
681
36
        return 1;
682
36
      }
683
291
    }
684
685
570k
    if (line.compare(0, 10, "FORBIDWARN", 10) == 0) {
686
94
      forbidwarn = 1;
687
94
    }
688
689
    /* parse in the flag used by the affix generator */
690
570k
    if (line.compare(0, 11, "SUBSTANDARD", 11) == 0) {
691
107
      if (!parse_flag(line, &substandard, afflst)) {
692
5
        finishFileMgr(afflst);
693
5
        return 1;
694
5
      }
695
107
    }
696
697
570k
    if (line.compare(0, 11, "CHECKSHARPS", 11) == 0) {
698
1.01k
      checksharps = 1;
699
1.01k
    }
700
701
    /* parse this affix: P - prefix, S - suffix */
702
    // affix type
703
570k
    char ft = ' ';
704
570k
    if (line.compare(0, 3, "PFX", 3) == 0)
705
15.0k
      ft = complexprefixes ? 'S' : 'P';
706
570k
    if (line.compare(0, 3, "SFX", 3) == 0)
707
20.0k
      ft = complexprefixes ? 'P' : 'S';
708
570k
    if (ft != ' ') {
709
35.1k
      if (dupflags_ini) {
710
7.26k
        memset(dupflags, 0, sizeof(dupflags));
711
7.26k
        dupflags_ini = 0;
712
7.26k
      }
713
35.1k
      if (!parse_affix(line, ft, afflst, dupflags)) {
714
1.74k
        finishFileMgr(afflst);
715
1.74k
        return 1;
716
1.74k
      }
717
35.1k
    }
718
570k
  }
719
720
12.0k
  finishFileMgr(afflst);
721
  // affix trees are sorted now
722
723
  // now we can speed up performance greatly taking advantage of the
724
  // relationship between the affixes and the idea of "subsets".
725
726
  // View each prefix as a potential leading subset of another and view
727
  // each suffix (reversed) as a potential trailing subset of another.
728
729
  // To illustrate this relationship if we know the prefix "ab" is found in the
730
  // word to examine, only prefixes that "ab" is a leading subset of need be
731
  // examined.
732
  // Furthermore, if "ab" is not present then none of the prefixes that "ab" is
733
  // a subset of need be examined.
734
  // The same argument goes for suffix string that are reversed.
735
736
  // Then to top this off why not examine the first char of the word to quickly
737
  // limit the set of prefixes to examine (i.e. the prefixes to examine must
738
  // be leading supersets of the first character of the word, if they exist).
739
740
  // To take advantage of this "subset" relationship, we need to add two links
741
  // from entry.  One to take next if the current prefix is found (call it
742
  // nexteq)
743
  // and one to take next if the current prefix is not found (call it nextne).
744
745
  // Since we have built ordered lists, all that remains is to properly
746
  // initialize
747
  // the nextne and nexteq pointers that relate them
748
749
12.0k
  process_pfx_order();
750
12.0k
  process_sfx_order();
751
752
12.0k
  return 0;
753
18.3k
}
754
755
// we want to be able to quickly access prefix information
756
// both by prefix flag, and sorted by prefix string itself
757
// so we need to set up two indexes
758
759
18.5k
int AffixMgr::build_pfxtree(PfxEntry* pfxptr) {
760
18.5k
  PfxEntry* ptr;
761
18.5k
  PfxEntry* pptr;
762
18.5k
  PfxEntry* ep = pfxptr;
763
764
  // get the right starting points
765
18.5k
  const char* key = ep->getKey();
766
18.5k
  const auto flg = (unsigned char)(ep->getFlag() & 0x00FF);
767
768
  // first index by flag which must exist
769
18.5k
  ptr = pFlag[flg];
770
18.5k
  ep->setFlgNxt(ptr);
771
18.5k
  pFlag[flg] = ep;
772
773
  // handle the special case of null affix string
774
18.5k
  if (*key == '\0') {
775
    // always insert them at head of list at element 0
776
7.02k
    ptr = pStart[0];
777
7.02k
    ep->setNext(ptr);
778
7.02k
    pStart[0] = ep;
779
7.02k
    return 0;
780
7.02k
  }
781
782
  // now handle the normal case
783
11.5k
  ep->setNextEQ(NULL);
784
11.5k
  ep->setNextNE(NULL);
785
786
11.5k
  unsigned char sp = *((const unsigned char*)key);
787
11.5k
  ptr = pStart[sp];
788
789
  // handle the first insert
790
11.5k
  if (!ptr) {
791
2.63k
    pStart[sp] = ep;
792
2.63k
    return 0;
793
2.63k
  }
794
795
  // otherwise use binary tree insertion so that a sorted
796
  // list can easily be generated later
797
8.86k
  pptr = NULL;
798
294k
  for (;;) {
799
294k
    pptr = ptr;
800
294k
    if (strcmp(ep->getKey(), ptr->getKey()) <= 0) {
801
290k
      ptr = ptr->getNextEQ();
802
290k
      if (!ptr) {
803
8.14k
        pptr->setNextEQ(ep);
804
8.14k
        break;
805
8.14k
      }
806
290k
    } else {
807
4.70k
      ptr = ptr->getNextNE();
808
4.70k
      if (!ptr) {
809
717
        pptr->setNextNE(ep);
810
717
        break;
811
717
      }
812
4.70k
    }
813
294k
  }
814
8.86k
  return 0;
815
11.5k
}
816
817
// we want to be able to quickly access suffix information
818
// both by suffix flag, and sorted by the reverse of the
819
// suffix string itself; so we need to set up two indexes
820
27.1k
int AffixMgr::build_sfxtree(SfxEntry* sfxptr) {
821
822
27.1k
  sfxptr->initReverseWord();
823
824
27.1k
  SfxEntry* ptr;
825
27.1k
  SfxEntry* pptr;
826
27.1k
  SfxEntry* ep = sfxptr;
827
828
  /* get the right starting point */
829
27.1k
  const char* key = ep->getKey();
830
27.1k
  const auto flg = (unsigned char)(ep->getFlag() & 0x00FF);
831
832
  // first index by flag which must exist
833
27.1k
  ptr = sFlag[flg];
834
27.1k
  ep->setFlgNxt(ptr);
835
27.1k
  sFlag[flg] = ep;
836
837
  // next index by affix string
838
839
  // handle the special case of null affix string
840
27.1k
  if (*key == '\0') {
841
    // always insert them at head of list at element 0
842
13.4k
    ptr = sStart[0];
843
13.4k
    ep->setNext(ptr);
844
13.4k
    sStart[0] = ep;
845
13.4k
    return 0;
846
13.4k
  }
847
848
  // now handle the normal case
849
13.7k
  ep->setNextEQ(NULL);
850
13.7k
  ep->setNextNE(NULL);
851
852
13.7k
  unsigned char sp = *((const unsigned char*)key);
853
13.7k
  ptr = sStart[sp];
854
855
  // handle the first insert
856
13.7k
  if (!ptr) {
857
2.87k
    sStart[sp] = ep;
858
2.87k
    return 0;
859
2.87k
  }
860
861
  // otherwise use binary tree insertion so that a sorted
862
  // list can easily be generated later
863
10.8k
  pptr = NULL;
864
275k
  for (;;) {
865
275k
    pptr = ptr;
866
275k
    if (strcmp(ep->getKey(), ptr->getKey()) <= 0) {
867
270k
      ptr = ptr->getNextEQ();
868
270k
      if (!ptr) {
869
10.0k
        pptr->setNextEQ(ep);
870
10.0k
        break;
871
10.0k
      }
872
270k
    } else {
873
5.09k
      ptr = ptr->getNextNE();
874
5.09k
      if (!ptr) {
875
825
        pptr->setNextNE(ep);
876
825
        break;
877
825
      }
878
5.09k
    }
879
275k
  }
880
10.8k
  return 0;
881
13.7k
}
882
883
// convert from binary tree to sorted list
884
18.3k
int AffixMgr::process_pfx_tree_to_list() {
885
4.69M
  for (int i = 1; i < SETSIZE; i++) {
886
4.67M
    pStart[i] = process_pfx_in_order(pStart[i], NULL);
887
4.67M
  }
888
18.3k
  return 0;
889
18.3k
}
890
891
4.70M
// Recursively threads the tree rooted at `ptr` into a list through the
// entries' Next links. The NextNE subtree is linked first (ending in `nptr`),
// `ptr` is placed in front of it, and the NextEQ subtree is placed in front
// of `ptr`. Returns the head of the resulting list, or `nptr` unchanged when
// `ptr` is null.
PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr) {
  if (ptr == nullptr)
    return nptr;

  PfxEntry* const after = process_pfx_in_order(ptr->getNextNE(), nptr);
  ptr->setNext(after);
  return process_pfx_in_order(ptr->getNextEQ(), ptr);
}
899
900
// convert from binary tree to sorted list
901
18.3k
int AffixMgr::process_sfx_tree_to_list() {
902
4.69M
  for (int i = 1; i < SETSIZE; i++) {
903
4.67M
    sStart[i] = process_sfx_in_order(sStart[i], NULL);
904
4.67M
  }
905
18.3k
  return 0;
906
18.3k
}
907
908
4.70M
// Recursively threads the tree rooted at `ptr` into a list through the
// entries' Next links. The NextNE subtree is linked first (ending in `nptr`),
// `ptr` is placed in front of it, and the NextEQ subtree is placed in front
// of `ptr`. Returns the head of the resulting list, or `nptr` unchanged when
// `ptr` is null.
SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr) {
  if (ptr == nullptr)
    return nptr;

  SfxEntry* const after = process_sfx_in_order(ptr->getNextNE(), nptr);
  ptr->setNext(after);
  return process_sfx_in_order(ptr->getNextEQ(), ptr);
}
916
917
// reinitialize the PfxEntry links NextEQ and NextNE to speed searching
918
// using the idea of leading subsets this time
919
12.0k
int AffixMgr::process_pfx_order() {
920
12.0k
  PfxEntry* ptr;
921
922
  // loop through each prefix list starting point
923
3.08M
  for (int i = 1; i < SETSIZE; i++) {
924
3.07M
    ptr = pStart[i];
925
926
    // look through the remainder of the list
927
    //  and find next entry with affix that
928
    // the current one is not a subset of
929
    // mark that as destination for NextNE
930
    // use next in list that you are a subset
931
    // of as NextEQ
932
933
3.07M
    for (; ptr != NULL; ptr = ptr->getNext()) {
934
5.75k
      PfxEntry* nptr = ptr->getNext();
935
184k
      for (; nptr != NULL; nptr = nptr->getNext()) {
936
180k
        if (!isSubset(ptr->getKey(), nptr->getKey()))
937
1.36k
          break;
938
180k
      }
939
5.75k
      ptr->setNextNE(nptr);
940
5.75k
      ptr->setNextEQ(NULL);
941
5.75k
      if ((ptr->getNext()) &&
942
5.75k
          isSubset(ptr->getKey(), (ptr->getNext())->getKey()))
943
3.63k
        ptr->setNextEQ(ptr->getNext());
944
5.75k
    }
945
946
    // now clean up by adding smart search termination strings:
947
    // if you are already a superset of the previous prefix
948
    // but not a subset of the next, search can end here
949
    // so set NextNE properly
950
951
3.07M
    ptr = pStart[i];
952
3.07M
    for (; ptr != NULL; ptr = ptr->getNext()) {
953
5.75k
      PfxEntry* nptr = ptr->getNext();
954
5.75k
      PfxEntry* mptr = NULL;
955
184k
      for (; nptr != NULL; nptr = nptr->getNext()) {
956
180k
        if (!isSubset(ptr->getKey(), nptr->getKey()))
957
1.36k
          break;
958
178k
        mptr = nptr;
959
178k
      }
960
5.75k
      if (mptr)
961
3.63k
        mptr->setNextNE(NULL);
962
5.75k
    }
963
3.07M
  }
964
12.0k
  return 0;
965
12.0k
}
966
967
// initialize the SfxEntry links NextEQ and NextNE to speed searching
968
// using the idea of leading subsets this time
969
12.0k
int AffixMgr::process_sfx_order() {
970
12.0k
  SfxEntry* ptr;
971
972
  // loop through each prefix list starting point
973
3.08M
  for (int i = 1; i < SETSIZE; i++) {
974
3.07M
    ptr = sStart[i];
975
976
    // look through the remainder of the list
977
    //  and find next entry with affix that
978
    // the current one is not a subset of
979
    // mark that as destination for NextNE
980
    // use next in list that you are a subset
981
    // of as NextEQ
982
983
3.07M
    for (; ptr != NULL; ptr = ptr->getNext()) {
984
5.96k
      SfxEntry* nptr = ptr->getNext();
985
59.7k
      for (; nptr != NULL; nptr = nptr->getNext()) {
986
55.2k
        if (!isSubset(ptr->getKey(), nptr->getKey()))
987
1.54k
          break;
988
55.2k
      }
989
5.96k
      ptr->setNextNE(nptr);
990
5.96k
      ptr->setNextEQ(NULL);
991
5.96k
      if ((ptr->getNext()) &&
992
5.96k
          isSubset(ptr->getKey(), (ptr->getNext())->getKey()))
993
3.66k
        ptr->setNextEQ(ptr->getNext());
994
5.96k
    }
995
996
    // now clean up by adding smart search termination strings:
997
    // if you are already a superset of the previous suffix
998
    // but not a subset of the next, search can end here
999
    // so set NextNE properly
1000
1001
3.07M
    ptr = sStart[i];
1002
3.07M
    for (; ptr != NULL; ptr = ptr->getNext()) {
1003
5.96k
      SfxEntry* nptr = ptr->getNext();
1004
5.96k
      SfxEntry* mptr = NULL;
1005
59.7k
      for (; nptr != NULL; nptr = nptr->getNext()) {
1006
55.2k
        if (!isSubset(ptr->getKey(), nptr->getKey()))
1007
1.54k
          break;
1008
53.7k
        mptr = nptr;
1009
53.7k
      }
1010
5.96k
      if (mptr)
1011
3.66k
        mptr->setNextNE(NULL);
1012
5.96k
    }
1013
3.07M
  }
1014
12.0k
  return 0;
1015
12.0k
}
1016
1017
// add flags to the result for dictionary debugging
1018
0
std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) {
1019
0
  std::string st = encode_flag(flag);
1020
0
  result.push_back(MSEP_FLD);
1021
0
  result.append(MORPH_FLAG);
1022
0
  result.append(st);
1023
0
  return result;
1024
0
}
1025
1026
// calculate the character length of the condition
1027
24.7k
int AffixMgr::condlen(const std::string& s) {
1028
24.7k
  int l = 0;
1029
24.7k
  bool group = false;
1030
24.7k
  auto st = s.begin(), end = s.end();
1031
3.76M
  while (st != end) {
1032
3.74M
    if (*st == '[') {
1033
41.8k
      group = true;
1034
41.8k
      l++;
1035
3.70M
    } else if (*st == ']')
1036
16.9k
      group = false;
1037
3.68M
    else if (!group && (!utf8 || (!(*st & 0x80) || ((*st & 0xc0) == 0x80))))
1038
3.11M
      l++;
1039
3.74M
    ++st;
1040
3.74M
  }
1041
24.7k
  return l;
1042
24.7k
}
1043
1044
27.8k
// Stores the condition string `cs` in `entry`.
//   * "."                     -> no condition (numconds = 0, empty conds)
//   * shorter than MAXCONDLEN -> copied into entry.c.conds, zero padded
//   * otherwise               -> first MAXCONDLEN bytes copied; if there is
//     a non-NUL byte at index MAXCONDLEN the entry is marked aeLONGCOND and
//     everything from index MAXCONDLEN_1 on is copied, NUL-terminated, into
//     a newly allocated entry.c.l.conds2
// Always returns 0.
int AffixMgr::encodeit(AffEntry& entry, const std::string& cs) {
  if (cs.compare(".") == 0) {
    entry.numconds = 0;
    entry.c.conds[0] = '\0';
    return 0;
  }

  entry.numconds = (char)condlen(cs);
  const size_t total = cs.size();

  if (total < MAXCONDLEN) {
    memcpy(entry.c.conds, cs.data(), total);
    // blank out the remaining space
    memset(entry.c.conds + total, 0, MAXCONDLEN - total);
    return 0;
  }

  memcpy(entry.c.conds, cs.data(), MAXCONDLEN);
  if (cs[MAXCONDLEN]) {
    // more conditions than fit in the fixed space: it is a long condition
    entry.opts |= aeLONGCOND;
    const size_t tail = total - MAXCONDLEN_1;
    entry.c.l.conds2 = new char[1 + tail];
    memcpy(entry.c.l.conds2, cs.data() + MAXCONDLEN_1, tail);
    entry.c.l.conds2[tail] = 0;
  }
  return 0;
}
1068
1069
// return 1 if s1 is a leading subset of s2 (dots are for infixes)
1070
80.6M
// Returns 1 when s1 is a leading subset of s2, i.e. every character of s1
// matches the character at the same position of s2; a '.' in s1 matches any
// character (dots are for infixes). Returns 0 otherwise.
inline int AffixMgr::isSubset(const char* s1, const char* s2) {
  for (; *s1 != '\0' && *s2 != '\0'; ++s1, ++s2) {
    if (*s1 != *s2 && *s1 != '.')
      break;
  }
  // s1 fully consumed <=> it was a leading subset
  return *s1 == '\0';
}
1077
1078
// check word for prefixes
1079
struct hentry* AffixMgr::prefix_check(const std::string& word,
1080
                                      int start,
1081
                                      int len,
1082
                                      char in_compound,
1083
1.32G
                                      const FLAG needflag) {
1084
1.32G
  struct hentry* rv = NULL;
1085
1086
1.32G
  pfx = NULL;
1087
1.32G
  pfxappnd = NULL;
1088
1.32G
  sfxappnd = NULL;
1089
1.32G
  sfxextra = 0;
1090
1091
  // first handle the special case of 0 length prefixes
1092
1.32G
  PfxEntry* pe = pStart[0];
1093
1.67G
  while (pe) {
1094
357M
    if (
1095
        // fogemorpheme
1096
357M
        ((in_compound != IN_CPD_NOT) ||
1097
357M
         !(pe->getCont() &&
1098
55.4M
           (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
1099
        // permit prefixes in compounds
1100
357M
        ((in_compound != IN_CPD_END) ||
1101
355M
         (pe->getCont() &&
1102
343M
          (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))) {
1103
      // check prefix
1104
343M
      rv = pe->checkword(word, start, len, in_compound, needflag);
1105
343M
      if (rv) {
1106
7.18M
        pfx = pe;  // BUG: pfx not stateless
1107
7.18M
        return rv;
1108
7.18M
      }
1109
343M
    }
1110
350M
    pe = pe->getNext();
1111
350M
  }
1112
1113
  // now handle the general case
1114
1.31G
  unsigned char sp = word[start];
1115
1.31G
  PfxEntry* pptr = pStart[sp];
1116
1117
1.38G
  while (pptr) {
1118
70.2M
    if (isSubset(pptr->getKey(), word.c_str() + start)) {
1119
68.3M
      if (
1120
          // fogemorpheme
1121
68.3M
          ((in_compound != IN_CPD_NOT) ||
1122
68.3M
           !(pptr->getCont() &&
1123
6.87M
             (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
1124
          // permit prefixes in compounds
1125
68.3M
          ((in_compound != IN_CPD_END) ||
1126
68.0M
           (pptr->getCont() && (TESTAFF(pptr->getCont(), compoundpermitflag,
1127
65.6M
                                        pptr->getContLen()))))) {
1128
        // check prefix
1129
65.6M
        rv = pptr->checkword(word, start, len, in_compound, needflag);
1130
65.6M
        if (rv) {
1131
285k
          pfx = pptr;  // BUG: pfx not stateless
1132
285k
          return rv;
1133
285k
        }
1134
65.6M
      }
1135
68.0M
      pptr = pptr->getNextEQ();
1136
68.0M
    } else {
1137
1.91M
      pptr = pptr->getNextNE();
1138
1.91M
    }
1139
70.2M
  }
1140
1141
1.31G
  return NULL;
1142
1.31G
}
1143
1144
// check word for prefixes and two-level suffixes
1145
struct hentry* AffixMgr::prefix_check_twosfx(const std::string& word,
1146
                                             int start,
1147
                                             int len,
1148
                                             char in_compound,
1149
50.6M
                                             const FLAG needflag) {
1150
50.6M
  struct hentry* rv = NULL;
1151
1152
50.6M
  pfx = NULL;
1153
50.6M
  sfxappnd = NULL;
1154
50.6M
  sfxextra = 0;
1155
1156
  // first handle the special case of 0 length prefixes
1157
50.6M
  PfxEntry* pe = pStart[0];
1158
1159
114M
  while (pe) {
1160
64.2M
    rv = pe->check_twosfx(word, start, len, in_compound, needflag);
1161
64.2M
    if (rv)
1162
8.61k
      return rv;
1163
64.2M
    pe = pe->getNext();
1164
64.2M
  }
1165
1166
  // now handle the general case
1167
50.6M
  unsigned char sp = word[start];
1168
50.6M
  PfxEntry* pptr = pStart[sp];
1169
1170
60.1M
  while (pptr) {
1171
9.46M
    if (isSubset(pptr->getKey(), word.c_str() + start)) {
1172
8.57M
      rv = pptr->check_twosfx(word, start, len, in_compound, needflag);
1173
8.57M
      if (rv) {
1174
886
        pfx = pptr;
1175
886
        return rv;
1176
886
      }
1177
8.57M
      pptr = pptr->getNextEQ();
1178
8.57M
    } else {
1179
890k
      pptr = pptr->getNextNE();
1180
890k
    }
1181
9.46M
  }
1182
1183
50.6M
  return NULL;
1184
50.6M
}
1185
1186
// check word for prefixes and morph
1187
std::string AffixMgr::prefix_check_morph(const std::string& word,
1188
                                         int start,
1189
                                         int len,
1190
                                         char in_compound,
1191
0
                                         const FLAG needflag) {
1192
1193
0
  std::string result;
1194
1195
0
  pfx = NULL;
1196
0
  sfxappnd = NULL;
1197
0
  sfxextra = 0;
1198
1199
  // first handle the special case of 0 length prefixes
1200
0
  PfxEntry* pe = pStart[0];
1201
0
  while (pe) {
1202
0
    std::string st = pe->check_morph(word, start, len, in_compound, needflag);
1203
0
    if (!st.empty()) {
1204
0
      result.append(st);
1205
0
    }
1206
0
    pe = pe->getNext();
1207
0
  }
1208
1209
  // now handle the general case
1210
0
  unsigned char sp = word[start];
1211
0
  PfxEntry* pptr = pStart[sp];
1212
1213
0
  while (pptr) {
1214
0
    if (isSubset(pptr->getKey(), word.c_str() + start)) {
1215
0
      std::string st = pptr->check_morph(word, start, len, in_compound, needflag);
1216
0
      if (!st.empty()) {
1217
        // fogemorpheme
1218
0
        if ((in_compound != IN_CPD_NOT) ||
1219
0
            !((pptr->getCont() && (TESTAFF(pptr->getCont(), onlyincompound,
1220
0
                                           pptr->getContLen()))))) {
1221
0
          result.append(st);
1222
0
          pfx = pptr;
1223
0
        }
1224
0
      }
1225
0
      pptr = pptr->getNextEQ();
1226
0
    } else {
1227
0
      pptr = pptr->getNextNE();
1228
0
    }
1229
0
  }
1230
1231
0
  return result;
1232
0
}
1233
1234
// check word for prefixes and morph and two-level suffixes
1235
std::string AffixMgr::prefix_check_twosfx_morph(const std::string& word,
1236
                                                int start,
1237
                                                int len,
1238
                                                char in_compound,
1239
0
                                                const FLAG needflag) {
1240
0
  std::string result;
1241
1242
0
  pfx = NULL;
1243
0
  sfxappnd = NULL;
1244
0
  sfxextra = 0;
1245
1246
  // first handle the special case of 0 length prefixes
1247
0
  PfxEntry* pe = pStart[0];
1248
0
  while (pe) {
1249
0
    std::string st = pe->check_twosfx_morph(word, start, len, in_compound, needflag);
1250
0
    if (!st.empty()) {
1251
0
      result.append(st);
1252
0
    }
1253
0
    pe = pe->getNext();
1254
0
  }
1255
1256
  // now handle the general case
1257
0
  unsigned char sp = word[start];
1258
0
  PfxEntry* pptr = pStart[sp];
1259
1260
0
  while (pptr) {
1261
0
    if (isSubset(pptr->getKey(), word.c_str() + start)) {
1262
0
      std::string st = pptr->check_twosfx_morph(word, start, len, in_compound, needflag);
1263
0
      if (!st.empty()) {
1264
0
        result.append(st);
1265
0
        pfx = pptr;
1266
0
      }
1267
0
      pptr = pptr->getNextEQ();
1268
0
    } else {
1269
0
      pptr = pptr->getNextNE();
1270
0
    }
1271
0
  }
1272
1273
0
  return result;
1274
0
}
1275
1276
// Is word a non-compound with a REP substitution (see checkcompoundrep)?
1277
204k
int AffixMgr::cpdrep_check(const std::string& in_word, int wl) {
1278
1279
204k
  if ((wl < 2) || get_reptable().empty())
1280
168k
    return 0;
1281
1282
35.6k
  std::string word(in_word, 0, wl);
1283
1284
428k
  for (const auto& i : get_reptable()) {
1285
    // use only available mid patterns
1286
428k
    if (!i.outstrings[0].empty()) {
1287
427k
      size_t r = 0;
1288
427k
      const size_t lenp = i.pattern.size();
1289
      // search every occurence of the pattern in the word
1290
1.45M
      while ((r = word.find(i.pattern, r)) != std::string::npos) {
1291
1.03M
        std::string candidate(word);
1292
1.03M
        candidate.replace(r, lenp, i.outstrings[0]);
1293
1.03M
        if (candidate_check(candidate))
1294
9.56k
          return 1;
1295
1.02M
        ++r;  // search for the next letter
1296
1.02M
      }
1297
427k
    }
1298
428k
  }
1299
1300
26.1k
 return 0;
1301
35.6k
}
1302
1303
// forbid compound words, if they are in the dictionary as a
1304
// word pair separated by space
1305
802k
int AffixMgr::cpdwordpair_check(const std::string& word, int wl) {
1306
802k
  if (wl > 2) {
1307
713k
    std::string candidate(word, 0, wl);
1308
17.1M
    for (size_t i = 1; i < candidate.size(); i++) {
1309
      // go to end of the UTF-8 character
1310
16.5M
      if (utf8 && ((candidate[i] & 0xc0) == 0x80))
1311
270k
          continue;
1312
16.2M
      candidate.insert(i, 1, ' ');
1313
16.2M
      if (candidate_check(candidate))
1314
85.6k
        return 1;
1315
16.1M
      candidate.erase(i, 1);
1316
16.1M
    }
1317
713k
  }
1318
1319
717k
  return 0;
1320
802k
}
1321
1322
// forbid compoundings when there are special patterns at word bound
1323
int AffixMgr::cpdpat_check(const std::string& word,
1324
                           size_t pos,
1325
                           hentry* r1,
1326
                           hentry* r2,
1327
458k
                           const char /*affixed*/) {
1328
464k
  for (auto& i : checkcpdtable) {
1329
464k
    size_t len;
1330
464k
    if (isSubset(i.pattern2.c_str(), word.c_str() + pos) &&
1331
464k
        (!r1 || !i.cond ||
1332
108k
         (r1->astr && TESTAFF(r1->astr, i.cond, r1->alen))) &&
1333
464k
        (!r2 || !i.cond2 ||
1334
101k
         (r2->astr && TESTAFF(r2->astr, i.cond2, r2->alen))) &&
1335
        // zero length pattern => only TESTAFF
1336
        // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
1337
464k
        (i.pattern.empty() ||
1338
91.3k
         ((i.pattern[0] == '0' && r1->blen <= pos &&
1339
79.0k
           strncmp(word.c_str() + pos - r1->blen, r1->word, r1->blen) == 0) ||
1340
79.0k
          (i.pattern[0] != '0' &&
1341
78.4k
           ((len = i.pattern.size()) != 0) && len <= pos &&
1342
78.4k
           strncmp(word.c_str() + pos - len, i.pattern.c_str(), len) == 0)))) {
1343
40.8k
      return 1;
1344
40.8k
    }
1345
464k
  }
1346
417k
  return 0;
1347
458k
}
1348
1349
// forbid compounding with neighbouring upper and lower case characters at word
1350
// bounds
1351
636k
int AffixMgr::cpdcase_check(const std::string& word, int pos) {
1352
636k
  if (utf8) {
1353
90.2k
    const char* p;
1354
90.2k
    const char* wordp = word.c_str();
1355
94.9k
    for (p = wordp + pos - 1; p > wordp && (*p & 0xc0) == 0x80; p--)
1356
4.74k
      ;
1357
90.2k
    std::string pair(p);
1358
90.2k
    std::vector<w_char> pair_u;
1359
90.2k
    u8_u16(pair_u, pair);
1360
90.2k
    unsigned short a = pair_u.size() > 1 ? (unsigned short)pair_u[1] : 0,
1361
90.2k
                   b = !pair_u.empty() ? (unsigned short)pair_u[0] : 0;
1362
90.2k
    if (((unicodetoupper(a, langnum) == a) ||
1363
90.2k
         (unicodetoupper(b, langnum) == b)) &&
1364
90.2k
        (a != '-') && (b != '-'))
1365
84.9k
      return 1;
1366
545k
  } else {
1367
545k
    const unsigned char a = word[pos - 1], b = word[pos];
1368
545k
    if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-'))
1369
22.9k
      return 1;
1370
545k
  }
1371
528k
  return 0;
1372
636k
}
1373
1374
// One backtracking record per '*' / '?' metacharacter that matched at least
// one word; defcpd_check() keeps a vector of these.
struct metachar_data {
1375
  signed short btpp;  // pattern position to resume from when backtracking over a metacharacter (*, ?)
1376
  signed short btwp;  // word position where the metacharacter's match started
1377
  int btnum;          // number of words currently matched by the metacharacter
1378
};
1379
1380
// check compound patterns
1381
int AffixMgr::defcpd_check(hentry*** words,
1382
                           short wnum,
1383
                           hentry* rv,
1384
                           hentry** def,
1385
3.43M
                           char all) {
1386
3.43M
  int w = 0;
1387
1388
3.43M
  if (!*words) {
1389
3.28M
    w = 1;
1390
3.28M
    *words = def;
1391
3.28M
  }
1392
1393
3.43M
  if (!*words) {
1394
0
    return 0;
1395
0
  }
1396
1397
3.43M
  std::vector<metachar_data> btinfo(1);
1398
1399
3.43M
  short bt = 0;
1400
1401
3.43M
  (*words)[wnum] = rv;
1402
1403
  // has the last word COMPOUNDRULE flag?
1404
3.43M
  if (rv->alen == 0) {
1405
2.43M
    (*words)[wnum] = NULL;
1406
2.43M
    if (w)
1407
2.39M
      *words = NULL;
1408
2.43M
    return 0;
1409
2.43M
  }
1410
1.00M
  int ok = 0;
1411
2.33M
  for (auto& i : defcpdtable) {
1412
8.96M
    for (auto& j : i) {
1413
8.96M
      if (j != '*' && j != '?' &&
1414
8.96M
          TESTAFF(rv->astr, j, rv->alen)) {
1415
923k
        ok = 1;
1416
923k
        break;
1417
923k
      }
1418
8.96M
    }
1419
2.33M
  }
1420
1.00M
  if (ok == 0) {
1421
120k
    (*words)[wnum] = NULL;
1422
120k
    if (w)
1423
117k
      *words = NULL;
1424
120k
    return 0;
1425
120k
  }
1426
1427
1.44M
  for (auto& i : defcpdtable) {
1428
1.44M
    size_t pp = 0;  // pattern position
1429
1.44M
    signed short wp = 0;  // "words" position
1430
1.44M
    int ok2 = 1;
1431
1.44M
    ok = 1;
1432
1.80M
    do {
1433
2.69M
      while ((pp < i.size()) && (wp <= wnum)) {
1434
2.08M
        if (((pp + 1) < i.size()) &&
1435
2.08M
            ((i[pp + 1] == '*') ||
1436
1.90M
             (i[pp + 1] == '?'))) {
1437
546k
          int wend = (i[pp + 1] == '?') ? wp : wnum;
1438
546k
          ok2 = 1;
1439
546k
          pp += 2;
1440
546k
          btinfo[bt].btpp = pp;
1441
546k
          btinfo[bt].btwp = wp;
1442
916k
          while (wp <= wend) {
1443
592k
            if (!(*words)[wp] ||
1444
592k
                !(*words)[wp]->alen ||
1445
592k
                !TESTAFF((*words)[wp]->astr, i[pp - 2],
1446
592k
                         (*words)[wp]->alen)) {
1447
221k
              ok2 = 0;
1448
221k
              break;
1449
221k
            }
1450
370k
            wp++;
1451
370k
          }
1452
546k
          if (wp <= wnum)
1453
221k
            ok2 = 0;
1454
546k
          btinfo[bt].btnum = wp - btinfo[bt].btwp;
1455
546k
          if (btinfo[bt].btnum > 0) {
1456
327k
            ++bt;
1457
327k
            btinfo.resize(bt+1);
1458
327k
          }
1459
546k
          if (ok2)
1460
324k
            break;
1461
1.54M
        } else {
1462
1.54M
          ok2 = 1;
1463
1.54M
          if (!(*words)[wp] || !(*words)[wp]->alen ||
1464
1.54M
              !TESTAFF((*words)[wp]->astr, i[pp],
1465
1.54M
                       (*words)[wp]->alen)) {
1466
873k
            ok = 0;
1467
873k
            break;
1468
873k
          }
1469
669k
          pp++;
1470
669k
          wp++;
1471
669k
          if ((i.size() == pp) && !(wp > wnum))
1472
4.86k
            ok = 0;
1473
669k
        }
1474
2.08M
      }
1475
1.80M
      if (ok && ok2) {
1476
927k
        size_t r = pp;
1477
1.56M
        while ((i.size() > r) && ((r + 1) < i.size()) &&
1478
1.56M
               ((i[r + 1] == '*') ||
1479
1.25M
                (i[r + 1] == '?')))
1480
635k
          r += 2;
1481
927k
        if (i.size() <= r)
1482
111k
          return 1;
1483
927k
      }
1484
      // backtrack
1485
1.69M
      if (bt)
1486
686k
        do {
1487
686k
          ok = 1;
1488
686k
          btinfo[bt - 1].btnum--;
1489
686k
          pp = btinfo[bt - 1].btpp;
1490
686k
          wp = btinfo[bt - 1].btwp + (signed short)btinfo[bt - 1].btnum;
1491
686k
        } while ((btinfo[bt - 1].btnum < 0) && --bt);
1492
1.69M
    } while (bt);
1493
1494
1.33M
    if (ok && ok2 && (!all || (i.size() <= pp)))
1495
694k
      return 1;
1496
1497
    // check zero ending
1498
637k
    while (ok && ok2 && (i.size() > pp) &&
1499
637k
           ((pp + 1) < i.size()) &&
1500
637k
           ((i[pp + 1] == '*') ||
1501
2.60k
            (i[pp + 1] == '?')))
1502
1.32k
      pp += 2;
1503
635k
    if (ok && ok2 && (i.size() <= pp))
1504
0
      return 1;
1505
635k
  }
1506
78.8k
  (*words)[wnum] = NULL;
1507
78.8k
  if (w)
1508
51.9k
    *words = NULL;
1509
78.8k
  return 0;
1510
883k
}
1511
1512
17.2M
inline int AffixMgr::candidate_check(const std::string& word) {
1513
1514
17.2M
  struct hentry* rv = lookup(word.c_str(), word.size());
1515
17.2M
  if (rv)
1516
36.3k
    return 1;
1517
1518
  //  rv = prefix_check(word,0,len,1);
1519
  //  if (rv) return 1;
1520
1521
17.2M
  rv = affix_check(word, 0, word.size());
1522
17.2M
  if (rv)
1523
58.8k
    return 1;
1524
17.1M
  return 0;
1525
17.2M
}
1526
1527
// calculate the number of syllables (counted as vowels) for compound checking
1528
15.7M
short AffixMgr::get_syllable(const std::string& word) {
1529
15.7M
  if (cpdmaxsyllable == 0)
1530
14.8M
    return 0;
1531
1532
935k
  short num = 0;
1533
1534
935k
  if (!utf8) {
1535
932k
    num = (short)std::count_if(word.begin(), word.end(), 
1536
7.92M
          [&](char c) {
1537
7.92M
            return std::binary_search(cpdvowels.begin(), cpdvowels.end(), c);
1538
7.92M
          });
1539
932k
  } else if (!cpdvowels_utf16.empty()) {
1540
3.08k
    std::vector<w_char> w;
1541
3.08k
    u8_u16(w, word);
1542
3.08k
    num = (short)std::count_if(w.begin(), w.end(),
1543
23.0k
          [&](w_char wc) {
1544
23.0k
            return std::binary_search(cpdvowels_utf16.begin(), cpdvowels_utf16.end(), wc);
1545
23.0k
          });
1546
3.08k
  }
1547
1548
935k
  return num;
1549
15.7M
}
1550
1551
40.3M
void AffixMgr::setcminmax(size_t* cmin, size_t* cmax, const char* word, size_t len) {
1552
40.3M
  if (utf8) {
1553
6.40M
    int i;
1554
46.8M
    for (*cmin = 0, i = 0; (i < cpdmin) && *cmin < len; i++) {
1555
47.3M
      for ((*cmin)++; *cmin < len && (word[*cmin] & 0xc0) == 0x80; (*cmin)++)
1556
6.88M
        ;
1557
40.4M
    }
1558
40.6M
    for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax > 0; i++) {
1559
41.1M
      for ((*cmax)--; *cmax > 0 && (word[*cmax] & 0xc0) == 0x80; (*cmax)--)
1560
6.87M
        ;
1561
34.2M
    }
1562
33.8M
  } else {
1563
33.8M
    *cmin = cpdmin;
1564
33.8M
    *cmax = len - cpdmin + 1;
1565
33.8M
  }
1566
40.3M
}
1567
1568
// check if compound word is correctly spelled
1569
// hu_mov_rule = spec. Hungarian rule (XXX)
1570
struct hentry* AffixMgr::compound_check(const std::string& word,
1571
                                        short wordnum,
1572
                                        short numsyllable,
1573
                                        short maxwordnum,
1574
                                        short wnum,
1575
                                        hentry** words = NULL,
1576
                                        hentry** rwords = NULL,
1577
                                        char hu_mov_rule = 0,
1578
                                        char is_sug = 0,
1579
30.7M
                                        int* info = NULL) {
1580
30.7M
  short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
1581
30.7M
  hentry *rv = NULL, *rv_first;
1582
30.7M
  std::string st;
1583
30.7M
  char ch = '\0', affixed;
1584
30.7M
  size_t cmin, cmax;
1585
30.7M
  int striple = 0, soldi = 0, oldcmin = 0, oldcmax = 0, oldlen = 0, checkedstriple = 0;
1586
30.7M
  hentry** oldwords = words;
1587
30.7M
  size_t scpd = 0, len = word.size();
1588
1589
30.7M
  int checked_prefix;
1590
1591
  // add a time limit to handle possible
1592
  // combinatorical explosion of the overlapping words
1593
1594
30.7M
  HUNSPELL_THREAD_LOCAL std::chrono::steady_clock::time_point clock_time_start;
1595
30.7M
  HUNSPELL_THREAD_LOCAL bool timelimit_exceeded;
1596
1597
  // get the current time
1598
30.7M
  std::chrono::steady_clock::time_point clock_now = std::chrono::steady_clock::now();
1599
1600
30.7M
  if (wordnum == 0) {
1601
      // set the start time
1602
16.2M
      clock_time_start = clock_now;
1603
16.2M
      timelimit_exceeded = false;
1604
16.2M
  }
1605
14.5M
  else if (std::chrono::duration_cast<std::chrono::milliseconds>(clock_now - clock_time_start).count()
1606
14.5M
            > static_cast<double>(TIMELIMIT) * CLOCKS_PER_SEC * 1000)
1607
0
      timelimit_exceeded = true;
1608
1609
30.7M
  setcminmax(&cmin, &cmax, word.c_str(), len);
1610
1611
30.7M
  st.assign(word);
1612
1613
1.17G
  for (size_t i = cmin; i < cmax; ++i) {
1614
    // go to end of the UTF-8 character
1615
1.14G
    if (utf8) {
1616
580M
      for (; (st[i] & 0xc0) == 0x80; i++)
1617
241M
        ;
1618
338M
      if (i >= cmax)
1619
546k
        return NULL;
1620
338M
    }
1621
1622
1.14G
    words = oldwords;
1623
1.14G
    int onlycpdrule = (words) ? 1 : 0;
1624
1625
1.23G
    do {  // onlycpdrule loop
1626
1627
1.23G
      oldnumsyllable = numsyllable;
1628
1.23G
      oldwordnum = wordnum;
1629
1.23G
      checked_prefix = 0;
1630
1631
1.29G
      do {  // simplified checkcompoundpattern loop
1632
1633
1.29G
        if (timelimit_exceeded)
1634
0
          return 0;
1635
1636
1.29G
        if (scpd > 0) {
1637
128M
          for (; scpd <= checkcpdtable.size() &&
1638
128M
                 (checkcpdtable[scpd - 1].pattern3.empty() ||
1639
81.4M
                  i > word.size() ||
1640
81.4M
                  word.compare(i, checkcpdtable[scpd - 1].pattern3.size(), checkcpdtable[scpd - 1].pattern3) != 0);
1641
71.9M
               scpd++)
1642
71.9M
            ;
1643
1644
56.4M
          if (scpd > checkcpdtable.size())
1645
46.9M
            break;  // break simplified checkcompoundpattern loop
1646
9.50M
          st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern);
1647
9.50M
          soldi = i;
1648
9.50M
          i += checkcpdtable[scpd - 1].pattern.size();
1649
9.50M
          st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern2);
1650
9.50M
          st.replace(i + checkcpdtable[scpd - 1].pattern2.size(), std::string::npos,
1651
9.50M
                 word.substr(soldi + checkcpdtable[scpd - 1].pattern3.size()));
1652
1653
9.50M
          oldlen = len;
1654
9.50M
          len += checkcpdtable[scpd - 1].pattern.size() +
1655
9.50M
                 checkcpdtable[scpd - 1].pattern2.size() -
1656
9.50M
                 checkcpdtable[scpd - 1].pattern3.size();
1657
9.50M
          oldcmin = cmin;
1658
9.50M
          oldcmax = cmax;
1659
9.50M
          setcminmax(&cmin, &cmax, st.c_str(), len);
1660
1661
9.50M
          cmax = len - cpdmin + 1;
1662
9.50M
        }
1663
1664
1.24G
  if (i > st.size())
1665
227k
      return NULL;
1666
1667
1.24G
        ch = st[i];
1668
1.24G
        st[i] = '\0';
1669
1670
1.24G
        sfx = NULL;
1671
1.24G
        pfx = NULL;
1672
1673
        // FIRST WORD
1674
1675
1.24G
        affixed = 1;
1676
1.24G
        rv = lookup(st.c_str(), i);  // perhaps without prefix
1677
1678
        // forbid dictionary stems with COMPOUNDFORBIDFLAG in
1679
        // compound words, overriding the effect of COMPOUNDPERMITFLAG
1680
1.24G
        if ((rv) && compoundforbidflag &&
1681
1.24G
                TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule) {
1682
72.3k
            bool would_continue = !onlycpdrule && simplifiedcpd;
1683
72.3k
            if (!scpd && would_continue) {
1684
                // given the while conditions that continue jumps to, this situation
1685
                // never ends
1686
52.8k
                HUNSPELL_WARNING(stderr, "break infinite loop\n");
1687
52.8k
                break;
1688
52.8k
            }
1689
1690
19.4k
            if (scpd > 0 && would_continue) {
1691
                // under these conditions we loop again, but the assumption above
1692
                // appears to be that cmin and cmax are the original values they
1693
                // had in the outside loop
1694
2.24k
                cmin = oldcmin;
1695
2.24k
                cmax = oldcmax;
1696
2.24k
            }
1697
19.4k
            continue;
1698
72.3k
        }
1699
1700
        // search homonym with compound flag
1701
1.32G
        while ((rv) && !hu_mov_rule &&
1702
1.32G
               ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1703
89.5M
                !((compoundflag && !words && !onlycpdrule &&
1704
89.5M
                   TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1705
89.5M
                  (compoundbegin && !wordnum && !onlycpdrule &&
1706
77.0M
                   TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1707
89.5M
                  (compoundmiddle && wordnum && !words && !onlycpdrule &&
1708
76.6M
                   TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
1709
89.5M
                  (!defcpdtable.empty() && onlycpdrule &&
1710
76.4M
                   ((!words && !wordnum &&
1711
3.43M
                     defcpd_check(&words, wnum, rv, rwords, 0)) ||
1712
3.43M
                    (words &&
1713
2.71M
                     defcpd_check(&words, wnum, rv, rwords, 0))))) ||
1714
89.5M
                (scpd != 0 && checkcpdtable[scpd - 1].cond != FLAG_NULL &&
1715
75.7M
                 !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)))) {
1716
75.7M
          rv = rv->next_homonym;
1717
75.7M
        }
1718
1719
1.24G
        if (rv)
1720
13.8M
          affixed = 0;
1721
1722
1.24G
        if (!rv) {
1723
1.23G
          if (onlycpdrule)
1724
107M
            break;
1725
1.12G
          if (compoundflag &&
1726
1.12G
              !(rv = prefix_check(st, 0, i,
1727
905M
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
1728
905M
                                  compoundflag))) {
1729
898M
            if (((rv = suffix_check(
1730
898M
                      st, 0, i, 0, NULL, FLAG_NULL, compoundflag,
1731
898M
                      hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1732
898M
                 (compoundmoresuffixes &&
1733
898M
                  (rv = suffix_check_twosfx(st, 0, i, 0, NULL, compoundflag)))) &&
1734
898M
                !hu_mov_rule && sfx->getCont() &&
1735
898M
                ((compoundforbidflag &&
1736
27.3k
                  TESTAFF(sfx->getCont(), compoundforbidflag,
1737
27.3k
                          sfx->getContLen())) ||
1738
27.3k
                 (compoundend &&
1739
25.7k
                  TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
1740
2.03k
              rv = NULL;
1741
2.03k
            }
1742
898M
          }
1743
1744
1.12G
          if (rv ||
1745
1.12G
              (((wordnum == 0) && compoundbegin &&
1746
1.11G
                ((rv = suffix_check(
1747
149M
                      st, 0, i, 0, NULL, FLAG_NULL, compoundbegin,
1748
149M
                      hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1749
149M
                 (compoundmoresuffixes &&
1750
149M
                  (rv = suffix_check_twosfx(
1751
12.8M
                       st, 0, i, 0, NULL,
1752
12.8M
                       compoundbegin))) ||  // twofold suffixes + compound
1753
149M
                 (rv = prefix_check(st, 0, i,
1754
149M
                                    hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
1755
149M
                                    compoundbegin)))) ||
1756
1.11G
               ((wordnum > 0) && compoundmiddle &&
1757
1.11G
                ((rv = suffix_check(
1758
9.43M
                      st, 0, i, 0, NULL, FLAG_NULL, compoundmiddle,
1759
9.43M
                      hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1760
9.43M
                 (compoundmoresuffixes &&
1761
9.42M
                  (rv = suffix_check_twosfx(
1762
5.25M
                       st, 0, i, 0, NULL,
1763
5.25M
                       compoundmiddle))) ||  // twofold suffixes + compound
1764
9.43M
                 (rv = prefix_check(st, 0, i,
1765
9.41M
                                    hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
1766
9.41M
                                    compoundmiddle))))))
1767
7.28M
            checked_prefix = 1;
1768
          // else check forbiddenwords and needaffix
1769
1.12G
        } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1770
13.8M
                                TESTAFF(rv->astr, needaffix, rv->alen) ||
1771
13.8M
                                TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1772
13.8M
                                (is_sug && nosuggest &&
1773
12.5M
                                 TESTAFF(rv->astr, nosuggest, rv->alen)))) {
1774
1.27M
          st[i] = ch;
1775
          // continue;
1776
1.27M
          break;
1777
1.27M
        }
1778
1779
        // check non_compound flag in suffix and prefix
1780
1.13G
        if ((rv) && !hu_mov_rule &&
1781
1.13G
            ((pfx && pfx->getCont() &&
1782
19.8M
              TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
1783
19.8M
             (sfx && sfx->getCont() &&
1784
19.7M
              TESTAFF(sfx->getCont(), compoundforbidflag,
1785
19.7M
                      sfx->getContLen())))) {
1786
29.5k
          rv = NULL;
1787
29.5k
        }
1788
1789
        // check compoundend flag in suffix and prefix
1790
1.13G
        if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
1791
1.13G
            ((pfx && pfx->getCont() &&
1792
292k
              TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) ||
1793
292k
             (sfx && sfx->getCont() &&
1794
292k
              TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
1795
0
          rv = NULL;
1796
0
        }
1797
1798
        // check compoundmiddle flag in suffix and prefix
1799
1.13G
        if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
1800
1.13G
            !hu_mov_rule &&
1801
1.13G
            ((pfx && pfx->getCont() &&
1802
264k
              TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) ||
1803
264k
             (sfx && sfx->getCont() &&
1804
264k
              TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) {
1805
0
          rv = NULL;
1806
0
        }
1807
1808
        // check forbiddenwords
1809
1.13G
        if ((rv) && (rv->astr) &&
1810
1.13G
            (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1811
19.8M
             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1812
19.8M
             (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
1813
392k
          return NULL;
1814
392k
        }
1815
1816
        // increment word number, if the second root has a compoundroot flag
1817
1.13G
        if ((rv) && compoundroot &&
1818
1.13G
            (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1819
139k
          wordnum++;
1820
139k
        }
1821
1822
        // first word is acceptable in compound words?
1823
1.13G
        if (((rv) &&
1824
1.13G
             (checked_prefix || (words && words[wnum]) ||
1825
19.4M
              (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1826
19.4M
              ((oldwordnum == 0) && compoundbegin &&
1827
401k
               TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1828
19.4M
              ((oldwordnum > 0) && compoundmiddle &&
1829
105k
               TESTAFF(rv->astr, compoundmiddle, rv->alen))
1830
1831
              // LANG_hu section: spec. Hungarian rule
1832
19.4M
              || ((langnum == LANG_hu) && hu_mov_rule &&
1833
10.9k
                  (TESTAFF(
1834
10.6k
                       rv->astr, 'F',
1835
10.6k
                       rv->alen) ||  // XXX hardwired Hungarian dictionary codes
1836
10.6k
                   TESTAFF(rv->astr, 'G', rv->alen) ||
1837
10.6k
                   TESTAFF(rv->astr, 'H', rv->alen)))
1838
              // END of LANG_hu section
1839
19.4M
              ) &&
1840
1.13G
             (
1841
                 // test CHECKCOMPOUNDPATTERN conditions
1842
19.4M
                 scpd == 0 || checkcpdtable[scpd - 1].cond == FLAG_NULL ||
1843
19.4M
                 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)) &&
1844
1.13G
             !((checkcompoundtriple && scpd == 0 &&
1845
19.4M
                !words && i < word.size() && // test triple letters
1846
19.4M
                (word[i - 1] == word[i]) &&
1847
19.4M
                (((i > 1) && (word[i - 1] == word[i - 2])) ||
1848
81.1k
                 ((word[i - 1] == word[i + 1]))  // may be word[i+1] == '\0'
1849
81.1k
                 )) ||
1850
19.4M
               (checkcompoundcase && scpd == 0 && !words && i < word.size() &&
1851
19.3M
                cpdcase_check(word, i))))
1852
            // LANG_hu section: spec. Hungarian rule
1853
1.13G
            || ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
1854
1.11G
                (rv = affix_check(st, 0, i)) &&
1855
1.11G
                (sfx && sfx->getCont() &&
1856
9.45k
                 (  // XXX hardwired Hungarian dic. codes
1857
4.18k
                     TESTAFF(sfx->getCont(), (unsigned short)'x',
1858
4.18k
                             sfx->getContLen()) ||
1859
4.18k
                     TESTAFF(
1860
4.18k
                         sfx->getCont(), (unsigned short)'%',
1861
19.2M
                         sfx->getContLen()))))) {  // first word is ok condition
1862
1863
          // LANG_hu section: spec. Hungarian rule
1864
19.2M
          if (langnum == LANG_hu) {
1865
            // calculate syllable number of the word
1866
5.59M
            numsyllable += get_syllable(st.substr(0, i));
1867
            // + 1 word, if syllable number of the prefix > 1 (hungarian
1868
            // convention)
1869
5.59M
            if (pfx && (get_syllable(pfx->getKey()) > 1))
1870
15.4k
              wordnum++;
1871
5.59M
          }
1872
          // END of LANG_hu section
1873
1874
          // NEXT WORD(S)
1875
19.2M
          rv_first = rv;
1876
19.2M
          st[i] = ch;
1877
1878
19.4M
          do {  // striple loop
1879
1880
            // check simplifiedtriple
1881
19.4M
            if (simplifiedtriple) {
1882
489k
              if (striple) {
1883
203k
                checkedstriple = 1;
1884
203k
                i--;  // check "fahrt" instead of "ahrt" in "Schiffahrt"
1885
285k
              } else if (i > 2 && i <= word.size() && word[i - 1] == word[i - 2])
1886
205k
                striple = 1;
1887
489k
            }
1888
1889
19.4M
            rv = lookup(st.c_str() + i, st.size() - i);  // perhaps without prefix
1890
1891
            // search homonym with compound flag
1892
20.4M
            while ((rv) &&
1893
20.4M
                   ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1894
1.20M
                    !((compoundflag && !words &&
1895
1.20M
                       TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1896
1.20M
                      (compoundend && !words &&
1897
906k
                       TESTAFF(rv->astr, compoundend, rv->alen)) ||
1898
1.20M
                      (!defcpdtable.empty() && words &&
1899
890k
                       defcpd_check(&words, wnum + 1, rv, NULL, 1))) ||
1900
1.20M
                    (scpd != 0 && checkcpdtable[scpd - 1].cond2 != FLAG_NULL &&
1901
317k
                     !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2,
1902
1.03M
                              rv->alen)))) {
1903
1.03M
              rv = rv->next_homonym;
1904
1.03M
            }
1905
1906
            // check FORCEUCASE
1907
19.4M
            if (rv && forceucase &&
1908
19.4M
                (TESTAFF(rv->astr, forceucase, rv->alen)) &&
1909
19.4M
                !(info && *info & SPELL_ORIGCAP))
1910
4.56k
              rv = NULL;
1911
1912
19.4M
            if (rv && words && words[wnum + 1])
1913
538
              return rv_first;
1914
1915
19.4M
            oldnumsyllable2 = numsyllable;
1916
19.4M
            oldwordnum2 = wordnum;
1917
1918
            // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary
1919
            // code
1920
19.4M
            if ((rv) && (langnum == LANG_hu) &&
1921
19.4M
                (TESTAFF(rv->astr, 'I', rv->alen)) &&
1922
19.4M
                !(TESTAFF(rv->astr, 'J', rv->alen))) {
1923
18.5k
              numsyllable--;
1924
18.5k
            }
1925
            // END of LANG_hu section
1926
1927
            // increment word number, if the second root has a compoundroot flag
1928
19.4M
            if ((rv) && (compoundroot) &&
1929
19.4M
                (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1930
6.20k
              wordnum++;
1931
6.20k
            }
1932
1933
            // check forbiddenwords
1934
19.4M
            if ((rv) && (rv->astr) &&
1935
19.4M
                (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1936
169k
                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1937
169k
                 (is_sug && nosuggest &&
1938
164k
                  TESTAFF(rv->astr, nosuggest, rv->alen))))
1939
5.09k
              return NULL;
1940
1941
            // second word is acceptable, as a root?
1942
            // hungarian conventions: compounding is acceptable,
1943
            // when compound forms consist of 2 words, or if more,
1944
            // then the syllable number of root words must be 6, or lesser.
1945
1946
19.4M
            if ((rv) &&
1947
19.4M
                ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1948
164k
                 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
1949
19.4M
                (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
1950
164k
                 ((cpdmaxsyllable != 0) &&
1951
24.7k
                  (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
1952
17.6k
                   cpdmaxsyllable))) &&
1953
19.4M
                (
1954
                    // test CHECKCOMPOUNDPATTERN
1955
155k
                    checkcpdtable.empty() || scpd != 0 ||
1956
155k
                    (i < word.size() && !cpdpat_check(word, i, rv_first, rv, 0))) &&
1957
19.4M
                ((!checkcompounddup || (rv != rv_first)))
1958
                // test CHECKCOMPOUNDPATTERN conditions
1959
19.4M
                &&
1960
19.4M
                (scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL ||
1961
130k
                 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) {
1962
              // forbid compound word, if it is a non-compound word with typical
1963
              // fault
1964
130k
              if ((checkcompoundrep && cpdrep_check(word, len)) ||
1965
130k
                      cpdwordpair_check(word, len))
1966
27.9k
                return NULL;
1967
102k
              return rv_first;
1968
130k
            }
1969
1970
19.3M
            numsyllable = oldnumsyllable2;
1971
19.3M
            wordnum = oldwordnum2;
1972
1973
            // perhaps second word has prefix or/and suffix
1974
19.3M
            sfx = NULL;
1975
19.3M
            sfxflag = FLAG_NULL;
1976
19.3M
            rv = (compoundflag && !onlycpdrule && i < word.size())
1977
19.3M
                     ? affix_check(word, i, word.size() - i, compoundflag,
1978
17.9M
                                   IN_CPD_END)
1979
19.3M
                     : NULL;
1980
19.3M
            if (!rv && compoundend && !onlycpdrule) {
1981
658k
              sfx = NULL;
1982
658k
              pfx = NULL;
1983
658k
              if (i < word.size())
1984
657k
                rv = affix_check(word, i, word.size() - i, compoundend, IN_CPD_END);
1985
658k
            }
1986
1987
19.3M
            if (!rv && !defcpdtable.empty() && words) {
1988
764k
              if (i < word.size())
1989
764k
                rv = affix_check(word, i, word.size() - i, 0, IN_CPD_END);
1990
764k
              if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1))
1991
43
                return rv_first;
1992
764k
              rv = NULL;
1993
764k
            }
1994
1995
            // test CHECKCOMPOUNDPATTERN conditions (allowed forms)
1996
19.3M
            if (rv &&
1997
19.3M
                !(scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL ||
1998
75.6k
                  TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen)))
1999
5.69k
              rv = NULL;
2000
2001
            // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)
2002
19.3M
            if (rv && !checkcpdtable.empty() && scpd == 0 &&
2003
19.3M
                cpdpat_check(word, i, rv_first, rv, affixed))
2004
4.87k
              rv = NULL;
2005
2006
            // check non_compound flag in suffix and prefix
2007
19.3M
            if ((rv) && ((pfx && pfx->getCont() &&
2008
65.0k
                          TESTAFF(pfx->getCont(), compoundforbidflag,
2009
65.0k
                                  pfx->getContLen())) ||
2010
65.0k
                         (sfx && sfx->getCont() &&
2011
63.9k
                          TESTAFF(sfx->getCont(), compoundforbidflag,
2012
63.9k
                                  sfx->getContLen())))) {
2013
1.42k
              rv = NULL;
2014
1.42k
            }
2015
2016
            // check FORCEUCASE
2017
19.3M
            if (rv && forceucase &&
2018
19.3M
                (TESTAFF(rv->astr, forceucase, rv->alen)) &&
2019
19.3M
                !(info && *info & SPELL_ORIGCAP))
2020
4.67k
              rv = NULL;
2021
2022
            // check forbiddenwords
2023
19.3M
            if ((rv) && (rv->astr) &&
2024
19.3M
                (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2025
58.3k
                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
2026
58.3k
                 (is_sug && nosuggest &&
2027
51.0k
                  TESTAFF(rv->astr, nosuggest, rv->alen))))
2028
7.69k
              return NULL;
2029
2030
            // pfxappnd = prefix of word+i, or NULL
2031
            // calculate syllable number of prefix.
2032
            // hungarian convention: when syllable number of prefix is more,
2033
            // than 1, the prefix+word counts as two words.
2034
2035
19.3M
            if (langnum == LANG_hu) {
2036
5.53M
              if (i < word.size()) {
2037
                // calculate syllable number of the word
2038
5.37M
                numsyllable += get_syllable(word.substr(i));
2039
5.37M
              }
2040
2041
              // - affix syllable num.
2042
              // XXX only second suffix (inflections, not derivations)
2043
5.53M
              if (sfxappnd) {
2044
1.10k
                std::string tmp(sfxappnd);
2045
1.10k
                reverseword(tmp);
2046
1.10k
                numsyllable -= short(get_syllable(tmp) + sfxextra);
2047
5.53M
              } else {
2048
5.53M
                numsyllable -= short(sfxextra);
2049
5.53M
              }
2050
2051
              // + 1 word, if syllable number of the prefix > 1 (hungarian
2052
              // convention)
2053
5.53M
              if (pfx && (get_syllable(pfx->getKey()) > 1))
2054
722
                wordnum++;
2055
2056
              // increment syllable num, if last word has a SYLLABLENUM flag
2057
              // and the suffix is beginning `s'
2058
2059
5.53M
              if (!cpdsyllablenum.empty()) {
2060
253
                switch (sfxflag) {
2061
0
                  case 'c': {
2062
0
                    numsyllable += 2;
2063
0
                    break;
2064
0
                  }
2065
0
                  case 'J': {
2066
0
                    numsyllable += 1;
2067
0
                    break;
2068
0
                  }
2069
0
                  case 'I': {
2070
0
                    if (rv && TESTAFF(rv->astr, 'J', rv->alen))
2071
0
                      numsyllable += 1;
2072
0
                    break;
2073
0
                  }
2074
253
                }
2075
253
              }
2076
5.53M
            }
2077
2078
            // increment word number, if the second word has a compoundroot flag
2079
19.3M
            if ((rv) && (compoundroot) &&
2080
19.3M
                (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2081
9.52k
              wordnum++;
2082
9.52k
            }
2083
            // second word is acceptable, as a word with prefix or/and suffix?
2084
            // hungarian conventions: compounding is acceptable,
2085
            // when compound forms consist 2 word, otherwise
2086
            // the syllable number of root words is 6, or lesser.
2087
19.3M
            if ((rv) &&
2088
19.3M
                (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2089
51.3k
                 ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
2090
19.3M
                ((!checkcompounddup || (rv != rv_first)))) {
2091
              // forbid compound word, if it is a non-compound word with typical
2092
              // fault
2093
29.4k
              if ((checkcompoundrep && cpdrep_check(word, len)) ||
2094
29.4k
                      cpdwordpair_check(word, len))
2095
11.6k
                return NULL;
2096
17.7k
              return rv_first;
2097
29.4k
            }
2098
2099
19.2M
            numsyllable = oldnumsyllable2;
2100
19.2M
            wordnum = oldwordnum2;
2101
2102
            // perhaps second word is a compound word (recursive call)
2103
            // (only if SPELL_COMPOUND_2 is not set and maxwordnum is not exceeded)
2104
19.2M
            if ((!info || !(*info & SPELL_COMPOUND_2)) && wordnum + 2 < maxwordnum) {
2105
14.5M
              rv = compound_check(st.substr(i), wordnum + 1,
2106
14.5M
                                  numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
2107
14.5M
                                  is_sug, info);
2108
2109
14.5M
              if (rv && !checkcpdtable.empty() && i < word.size() &&
2110
14.5M
                  ((scpd == 0 &&
2111
397k
                    cpdpat_check(word, i, rv_first, rv, affixed)) ||
2112
397k
                   (scpd != 0 &&
2113
381k
                    !cpdpat_check(word, i, rv_first, rv, affixed))))
2114
25.5k
                rv = NULL;
2115
14.5M
            } else {
2116
4.76M
              rv = NULL;
2117
4.76M
            }
2118
19.2M
            if (rv) {
2119
              // forbid compound word, if it is a non-compound word with typical
2120
              // fault, or a dictionary word pair
2121
2122
553k
              if (cpdwordpair_check(word, len))
2123
49.9k
                  return NULL;
2124
2125
504k
              if (checkcompoundrep || forbiddenword) {
2126
2127
503k
                if (checkcompoundrep && cpdrep_check(word, len))
2128
626
                  return NULL;
2129
2130
                // check first part
2131
503k
                if (i < word.size() && word.compare(i, rv->blen, rv->word, rv->blen) == 0) {
2132
97.9k
                  char r = st[i + rv->blen];
2133
97.9k
                  st[i + rv->blen] = '\0';
2134
2135
97.9k
                  if ((checkcompoundrep && cpdrep_check(st, i + rv->blen)) ||
2136
97.9k
                      cpdwordpair_check(st, i + rv->blen)) {
2137
4.95k
                    st[ + i + rv->blen] = r;
2138
4.95k
                    continue;
2139
4.95k
                  }
2140
2141
93.0k
                  if (forbiddenword) {
2142
91.1k
                    struct hentry* rv2 = lookup(word.c_str(), word.size());
2143
91.1k
                    if (!rv2 && len <= word.size())
2144
80.5k
                      rv2 = affix_check(word, 0, len);
2145
91.1k
                    if (rv2 && rv2->astr &&
2146
91.1k
                        TESTAFF(rv2->astr, forbiddenword, rv2->alen) &&
2147
91.1k
                        (strncmp(rv2->word, st.c_str(), i + rv->blen) == 0)) {
2148
113
                      return NULL;
2149
113
                    }
2150
91.1k
                  }
2151
92.9k
                  st[i + rv->blen] = r;
2152
92.9k
                }
2153
503k
              }
2154
498k
              return rv_first;
2155
504k
            }
2156
19.2M
          } while (striple && !checkedstriple);  // end of striple loop
2157
2158
18.5M
          if (checkedstriple) {
2159
202k
            i++;
2160
202k
            checkedstriple = 0;
2161
202k
            striple = 0;
2162
202k
          }
2163
2164
18.5M
        }  // first word is ok condition
2165
2166
1.13G
        if (soldi != 0) {
2167
9.45M
          i = soldi;
2168
9.45M
          soldi = 0;
2169
9.45M
          len = oldlen;
2170
9.45M
          cmin = oldcmin;
2171
9.45M
          cmax = oldcmax;
2172
9.45M
        }
2173
1.13G
        scpd++;
2174
2175
1.13G
      } while (!onlycpdrule && simplifiedcpd &&
2176
1.13G
               scpd <= checkcpdtable.size());  // end of simplifiedcpd loop
2177
2178
1.23G
      scpd = 0;
2179
1.23G
      wordnum = oldwordnum;
2180
1.23G
      numsyllable = oldnumsyllable;
2181
2182
1.23G
      if (soldi != 0) {
2183
9.78k
        i = soldi;
2184
9.78k
        st.assign(word);  // XXX add more optim.
2185
9.78k
        soldi = 0;
2186
9.78k
        len = oldlen;
2187
9.78k
        cmin = oldcmin;
2188
9.78k
        cmax = oldcmax;
2189
9.78k
      } else
2190
1.23G
        st[i] = ch;
2191
2192
1.23G
    } while (!defcpdtable.empty() && oldwordnum == 0 &&
2193
1.23G
             onlycpdrule++ < 1);  // end of onlycpd loop
2194
1.14G
  }
2195
2196
28.9M
  return NULL;
2197
30.7M
}
2198
2199
// check if compound word is correctly spelled
2200
// hu_mov_rule = spec. Hungarian rule (XXX)
2201
int AffixMgr::compound_check_morph(const std::string& word,
2202
                                   short wordnum,
2203
                                   short numsyllable,
2204
                                   short maxwordnum,
2205
                                   short wnum,
2206
                                   hentry** words,
2207
                                   hentry** rwords,
2208
                                   char hu_mov_rule,
2209
                                   std::string& result,
2210
0
                                   const std::string* partresult) {
2211
0
  short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
2212
0
  hentry* rv = NULL, *rv_first;
2213
0
  std::string st, presult;
2214
0
  char ch, affixed = 0;
2215
0
  int checked_prefix, ok = 0;
2216
0
  size_t cmin, cmax;
2217
0
  hentry** oldwords = words;
2218
0
  size_t len = word.size();
2219
2220
  // add a time limit to handle possible
2221
  // combinatorical explosion of the overlapping words
2222
2223
0
  HUNSPELL_THREAD_LOCAL std::chrono::steady_clock::time_point clock_time_start;
2224
0
  HUNSPELL_THREAD_LOCAL bool timelimit_exceeded;
2225
2226
  // get the current time
2227
0
  std::chrono::steady_clock::time_point clock_now = std::chrono::steady_clock::now();
2228
2229
0
  if (wordnum == 0) {
2230
      // set the start time
2231
0
      clock_time_start = clock_now;
2232
0
      timelimit_exceeded = false;
2233
0
  }
2234
0
  else if (std::chrono::duration_cast<std::chrono::milliseconds>(clock_now - clock_time_start).count()
2235
0
            > static_cast<double>(TIMELIMIT) * CLOCKS_PER_SEC * 1000)
2236
0
      timelimit_exceeded = true;
2237
2238
0
  setcminmax(&cmin, &cmax, word.c_str(), len);
2239
2240
0
  st.assign(word);
2241
2242
0
  for (size_t i = cmin; i < cmax; ++i) {
2243
    // go to end of the UTF-8 character
2244
0
    if (utf8) {
2245
0
      for (; (st[i] & 0xc0) == 0x80; i++)
2246
0
        ;
2247
0
      if (i >= cmax)
2248
0
        return 0;
2249
0
    }
2250
2251
0
    words = oldwords;
2252
0
    int onlycpdrule = (words) ? 1 : 0;
2253
2254
0
    do {  // onlycpdrule loop
2255
2256
0
      if (timelimit_exceeded)
2257
0
        return 0;
2258
2259
0
      oldnumsyllable = numsyllable;
2260
0
      oldwordnum = wordnum;
2261
0
      checked_prefix = 0;
2262
2263
0
      ch = st[i];
2264
0
      st[i] = '\0';
2265
0
      sfx = NULL;
2266
2267
      // FIRST WORD
2268
2269
0
      affixed = 1;
2270
2271
0
      presult.clear();
2272
0
      if (partresult)
2273
0
        presult.append(*partresult);
2274
2275
0
      rv = lookup(st.c_str(), i);  // perhaps without prefix
2276
2277
      // forbid dictionary stems with COMPOUNDFORBIDFLAG in
2278
      // compound words, overriding the effect of COMPOUNDPERMITFLAG
2279
0
      if ((rv) && compoundforbidflag &&
2280
0
              TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
2281
0
          continue;
2282
2283
      // search homonym with compound flag
2284
0
      while ((rv) && !hu_mov_rule &&
2285
0
             ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2286
0
              !((compoundflag && !words && !onlycpdrule &&
2287
0
                 TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2288
0
                (compoundbegin && !wordnum && !onlycpdrule &&
2289
0
                 TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2290
0
                (compoundmiddle && wordnum && !words && !onlycpdrule &&
2291
0
                 TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
2292
0
                (!defcpdtable.empty() && onlycpdrule &&
2293
0
                 ((!words && !wordnum &&
2294
0
                   defcpd_check(&words, wnum, rv, rwords, 0)) ||
2295
0
                  (words &&
2296
0
                   defcpd_check(&words, wnum, rv, rwords, 0))))))) {
2297
0
        rv = rv->next_homonym;
2298
0
      }
2299
2300
2301
0
      if (rv)
2302
0
        affixed = 0;
2303
2304
0
      if (rv) {
2305
0
        presult.push_back(MSEP_FLD);
2306
0
        presult.append(MORPH_PART);
2307
0
        presult.append(st, 0, i);
2308
0
        if (!HENTRY_FIND(rv, MORPH_STEM)) {
2309
0
          presult.push_back(MSEP_FLD);
2310
0
          presult.append(MORPH_STEM);
2311
0
          presult.append(st, 0, i);
2312
0
        }
2313
0
        if (HENTRY_DATA(rv)) {
2314
0
          presult.push_back(MSEP_FLD);
2315
0
          presult.append(HENTRY_DATA2(rv));
2316
0
        }
2317
0
      }
2318
2319
0
      if (!rv) {
2320
0
        if (compoundflag &&
2321
0
            !(rv =
2322
0
                  prefix_check(st, 0, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
2323
0
                               compoundflag))) {
2324
0
          if (((rv = suffix_check(st, 0, i, 0, NULL, FLAG_NULL,
2325
0
                                  compoundflag,
2326
0
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2327
0
               (compoundmoresuffixes &&
2328
0
                (rv = suffix_check_twosfx(st, 0, i, 0, NULL, compoundflag)))) &&
2329
0
              !hu_mov_rule && sfx->getCont() &&
2330
0
              ((compoundforbidflag &&
2331
0
                TESTAFF(sfx->getCont(), compoundforbidflag,
2332
0
                        sfx->getContLen())) ||
2333
0
               (compoundend &&
2334
0
                TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
2335
0
            rv = NULL;
2336
0
          }
2337
0
        }
2338
2339
0
        if (rv ||
2340
0
            (((wordnum == 0) && compoundbegin &&
2341
0
              ((rv = suffix_check(st, 0, i, 0, NULL, FLAG_NULL,
2342
0
                                  compoundbegin,
2343
0
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2344
0
               (compoundmoresuffixes &&
2345
0
                (rv = suffix_check_twosfx(
2346
0
                     st, 0, i, 0, NULL,
2347
0
                     compoundbegin))) ||  // twofold suffix+compound
2348
0
               (rv = prefix_check(st, 0, i,
2349
0
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
2350
0
                                  compoundbegin)))) ||
2351
0
             ((wordnum > 0) && compoundmiddle &&
2352
0
              ((rv = suffix_check(st, 0, i, 0, NULL, FLAG_NULL,
2353
0
                                  compoundmiddle,
2354
0
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2355
0
               (compoundmoresuffixes &&
2356
0
                (rv = suffix_check_twosfx(
2357
0
                     st, 0, i, 0, NULL,
2358
0
                     compoundmiddle))) ||  // twofold suffix+compound
2359
0
               (rv = prefix_check(st, 0, i,
2360
0
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
2361
0
                                  compoundmiddle)))))) {
2362
0
          std::string p;
2363
0
          if (compoundflag)
2364
0
            p = affix_check_morph(st, 0, i, compoundflag);
2365
0
          if (p.empty()) {
2366
0
            if ((wordnum == 0) && compoundbegin) {
2367
0
              p = affix_check_morph(st, 0, i, compoundbegin);
2368
0
            } else if ((wordnum > 0) && compoundmiddle) {
2369
0
              p = affix_check_morph(st, 0, i, compoundmiddle);
2370
0
            }
2371
0
          }
2372
0
          if (!p.empty()) {
2373
0
            presult.push_back(MSEP_FLD);
2374
0
            presult.append(MORPH_PART);
2375
0
            presult.append(st, 0, i);
2376
0
            line_uniq_app(p, MSEP_REC);
2377
0
            presult.append(p);
2378
0
          }
2379
0
          checked_prefix = 1;
2380
0
        }
2381
        // else check forbiddenwords
2382
0
      } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2383
0
                              TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
2384
0
                              TESTAFF(rv->astr, needaffix, rv->alen))) {
2385
0
        st[i] = ch;
2386
0
        continue;
2387
0
      }
2388
2389
      // check non_compound flag in suffix and prefix
2390
0
      if ((rv) && !hu_mov_rule &&
2391
0
          ((pfx && pfx->getCont() &&
2392
0
            TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
2393
0
           (sfx && sfx->getCont() &&
2394
0
            TESTAFF(sfx->getCont(), compoundforbidflag, sfx->getContLen())))) {
2395
0
        continue;
2396
0
      }
2397
2398
      // check compoundend flag in suffix and prefix
2399
0
      if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
2400
0
          ((pfx && pfx->getCont() &&
2401
0
            TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) ||
2402
0
           (sfx && sfx->getCont() &&
2403
0
            TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
2404
0
        continue;
2405
0
      }
2406
2407
      // check compoundmiddle flag in suffix and prefix
2408
0
      if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
2409
0
          !hu_mov_rule &&
2410
0
          ((pfx && pfx->getCont() &&
2411
0
            TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) ||
2412
0
           (sfx && sfx->getCont() &&
2413
0
            TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) {
2414
0
        rv = NULL;
2415
0
      }
2416
2417
      // check forbiddenwords
2418
0
      if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2419
0
                                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)))
2420
0
        continue;
2421
2422
      // increment word number, if the second root has a compoundroot flag
2423
0
      if ((rv) && (compoundroot) &&
2424
0
          (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2425
0
        wordnum++;
2426
0
      }
2427
2428
      // first word is acceptable in compound words?
2429
0
      if (((rv) &&
2430
0
           (checked_prefix || (words && words[wnum]) ||
2431
0
            (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2432
0
            ((oldwordnum == 0) && compoundbegin &&
2433
0
             TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2434
0
            ((oldwordnum > 0) && compoundmiddle &&
2435
0
             TESTAFF(rv->astr, compoundmiddle, rv->alen))
2436
            // LANG_hu section: spec. Hungarian rule
2437
0
            || ((langnum == LANG_hu) &&  // hu_mov_rule
2438
0
                hu_mov_rule && (TESTAFF(rv->astr, 'F', rv->alen) ||
2439
0
                                TESTAFF(rv->astr, 'G', rv->alen) ||
2440
0
                                TESTAFF(rv->astr, 'H', rv->alen)))
2441
            // END of LANG_hu section
2442
0
            ) &&
2443
0
           !((checkcompoundtriple && !words &&  // test triple letters
2444
0
              (word[i - 1] == word[i]) &&
2445
0
              (((i > 1) && (word[i - 1] == word[i - 2])) ||
2446
0
               ((word[i - 1] == word[i + 1]))  // may be word[i+1] == '\0'
2447
0
               )) ||
2448
0
             (
2449
                 // test CHECKCOMPOUNDPATTERN
2450
0
                 !checkcpdtable.empty() && !words &&
2451
0
                 cpdpat_check(word, i, rv, NULL, affixed)) ||
2452
0
             (checkcompoundcase && !words && cpdcase_check(word, i))))
2453
          // LANG_hu section: spec. Hungarian rule
2454
0
          ||
2455
0
          ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
2456
0
           (rv = affix_check(st, 0, i)) &&
2457
0
           (sfx && sfx->getCont() &&
2458
0
            (TESTAFF(sfx->getCont(), (unsigned short)'x', sfx->getContLen()) ||
2459
0
             TESTAFF(sfx->getCont(), (unsigned short)'%', sfx->getContLen()))))
2460
          // END of LANG_hu section
2461
0
          ) {
2462
        // LANG_hu section: spec. Hungarian rule
2463
0
        if (langnum == LANG_hu) {
2464
          // calculate syllable number of the word
2465
0
          numsyllable += get_syllable(st.substr(0, i));
2466
2467
          // + 1 word, if syllable number of the prefix > 1 (hungarian
2468
          // convention)
2469
0
          if (pfx && (get_syllable(pfx->getKey()) > 1))
2470
0
            wordnum++;
2471
0
        }
2472
        // END of LANG_hu section
2473
2474
        // NEXT WORD(S)
2475
0
        rv_first = rv;
2476
0
        rv = lookup(word.c_str() + i, word.size() - i);  // perhaps without prefix
2477
2478
        // search homonym with compound flag
2479
0
        while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2480
0
                        !((compoundflag && !words &&
2481
0
                           TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2482
0
                          (compoundend && !words &&
2483
0
                           TESTAFF(rv->astr, compoundend, rv->alen)) ||
2484
0
                          (!defcpdtable.empty() && words &&
2485
0
                           defcpd_check(&words, wnum + 1, rv, NULL, 1))))) {
2486
0
          rv = rv->next_homonym;
2487
0
        }
2488
2489
0
        if (rv && words && words[wnum + 1]) {
2490
0
          result.append(presult);
2491
0
          result.push_back(MSEP_FLD);
2492
0
          result.append(MORPH_PART);
2493
0
          result.append(word, i, word.size());
2494
0
          if (complexprefixes && HENTRY_DATA(rv))
2495
0
            result.append(HENTRY_DATA2(rv));
2496
0
          if (!HENTRY_FIND(rv, MORPH_STEM)) {
2497
0
            result.push_back(MSEP_FLD);
2498
0
            result.append(MORPH_STEM);
2499
0
            result.append(HENTRY_WORD(rv));
2500
0
          }
2501
          // store the pointer of the hash entry
2502
0
          if (!complexprefixes && HENTRY_DATA(rv)) {
2503
0
            result.push_back(MSEP_FLD);
2504
0
            result.append(HENTRY_DATA2(rv));
2505
0
          }
2506
0
          result.push_back(MSEP_REC);
2507
0
          return 0;
2508
0
        }
2509
2510
0
        oldnumsyllable2 = numsyllable;
2511
0
        oldwordnum2 = wordnum;
2512
2513
        // LANG_hu section: spec. Hungarian rule
2514
0
        if ((rv) && (langnum == LANG_hu) &&
2515
0
            (TESTAFF(rv->astr, 'I', rv->alen)) &&
2516
0
            !(TESTAFF(rv->astr, 'J', rv->alen))) {
2517
0
          numsyllable--;
2518
0
        }
2519
        // END of LANG_hu section
2520
        // increment word number, if the second root has a compoundroot flag
2521
0
        if ((rv) && (compoundroot) &&
2522
0
            (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2523
0
          wordnum++;
2524
0
        }
2525
2526
        // check forbiddenwords
2527
0
        if ((rv) && (rv->astr) &&
2528
0
            (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2529
0
             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) {
2530
0
          st[i] = ch;
2531
0
          continue;
2532
0
        }
2533
2534
        // second word is acceptable, as a root?
2535
        // hungarian conventions: compounding is acceptable,
2536
        // when compound forms consist of 2 words, or if more,
2537
        // then the syllable number of root words must be 6, or lesser.
2538
0
        if ((rv) &&
2539
0
            ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2540
0
             (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
2541
0
            (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2542
0
             ((cpdmaxsyllable != 0) &&
2543
0
              (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
2544
0
               cpdmaxsyllable))) &&
2545
0
            ((!checkcompounddup || (rv != rv_first)))) {
2546
          // bad compound word
2547
0
          result.append(presult);
2548
0
          result.push_back(MSEP_FLD);
2549
0
          result.append(MORPH_PART);
2550
0
          result.append(word, i, word.size());
2551
2552
0
          if (HENTRY_DATA(rv)) {
2553
0
            if (complexprefixes)
2554
0
              result.append(HENTRY_DATA2(rv));
2555
0
            if (!HENTRY_FIND(rv, MORPH_STEM)) {
2556
0
              result.push_back(MSEP_FLD);
2557
0
              result.append(MORPH_STEM);
2558
0
              result.append(HENTRY_WORD(rv));
2559
0
            }
2560
            // store the pointer of the hash entry
2561
0
            if (!complexprefixes) {
2562
0
              result.push_back(MSEP_FLD);
2563
0
              result.append(HENTRY_DATA2(rv));
2564
0
            }
2565
0
          }
2566
0
          result.push_back(MSEP_REC);
2567
0
          ok = 1;
2568
0
        }
2569
2570
0
        numsyllable = oldnumsyllable2;
2571
0
        wordnum = oldwordnum2;
2572
2573
        // perhaps second word has prefix or/and suffix
2574
0
        sfx = NULL;
2575
0
        sfxflag = FLAG_NULL;
2576
2577
0
        if (compoundflag && !onlycpdrule)
2578
0
          rv = affix_check(word, i, word.size() - i, compoundflag);
2579
0
        else
2580
0
          rv = NULL;
2581
2582
0
        if (!rv && compoundend && !onlycpdrule) {
2583
0
          sfx = NULL;
2584
0
          pfx = NULL;
2585
0
          rv = affix_check(word, i, word.size() - i, compoundend);
2586
0
        }
2587
2588
0
        if (!rv && !defcpdtable.empty() && words) {
2589
0
          rv = affix_check(word, i, word.size() - i, 0, IN_CPD_END);
2590
0
          if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
2591
0
            std::string m;
2592
0
            if (compoundflag)
2593
0
              m = affix_check_morph(word, i, word.size() - i, compoundflag);
2594
0
            if (m.empty() && compoundend) {
2595
0
              m = affix_check_morph(word, i, word.size() - i, compoundend);
2596
0
            }
2597
0
            result.append(presult);
2598
0
            if (!m.empty()) {
2599
0
              result.push_back(MSEP_FLD);
2600
0
              result.append(MORPH_PART);
2601
0
              result.append(word, i, word.size());
2602
0
              line_uniq_app(m, MSEP_REC);
2603
0
              result.append(m);
2604
0
            }
2605
0
            result.push_back(MSEP_REC);
2606
0
            ok = 1;
2607
0
          }
2608
0
        }
2609
2610
        // check non_compound flag in suffix and prefix
2611
0
        if ((rv) &&
2612
0
            ((pfx && pfx->getCont() &&
2613
0
              TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
2614
0
             (sfx && sfx->getCont() &&
2615
0
              TESTAFF(sfx->getCont(), compoundforbidflag,
2616
0
                      sfx->getContLen())))) {
2617
0
          rv = NULL;
2618
0
        }
2619
2620
        // check forbiddenwords
2621
0
        if ((rv) && (rv->astr) &&
2622
0
            (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2623
0
             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)) &&
2624
0
            (!TESTAFF(rv->astr, needaffix, rv->alen))) {
2625
0
          st[i] = ch;
2626
0
          continue;
2627
0
        }
2628
2629
0
        if (langnum == LANG_hu) {
2630
          // calculate syllable number of the word
2631
0
          numsyllable += get_syllable(word.c_str() + i);
2632
2633
          // - affix syllable num.
2634
          // XXX only second suffix (inflections, not derivations)
2635
0
          if (sfxappnd) {
2636
0
            std::string tmp(sfxappnd);
2637
0
            reverseword(tmp);
2638
0
            numsyllable -= short(get_syllable(tmp) + sfxextra);
2639
0
          } else {
2640
0
            numsyllable -= short(sfxextra);
2641
0
          }
2642
2643
          // + 1 word, if syllable number of the prefix > 1 (hungarian
2644
          // convention)
2645
0
          if (pfx && (get_syllable(pfx->getKey()) > 1))
2646
0
            wordnum++;
2647
2648
          // increment syllable num, if last word has a SYLLABLENUM flag
2649
          // and the suffix is beginning `s'
2650
2651
0
          if (!cpdsyllablenum.empty()) {
2652
0
            switch (sfxflag) {
2653
0
              case 'c': {
2654
0
                numsyllable += 2;
2655
0
                break;
2656
0
              }
2657
0
              case 'J': {
2658
0
                numsyllable += 1;
2659
0
                break;
2660
0
              }
2661
0
              case 'I': {
2662
0
                if (rv && TESTAFF(rv->astr, 'J', rv->alen))
2663
0
                  numsyllable += 1;
2664
0
                break;
2665
0
              }
2666
0
            }
2667
0
          }
2668
0
        }
2669
2670
        // increment word number, if the second word has a compoundroot flag
2671
0
        if ((rv) && (compoundroot) &&
2672
0
            (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2673
0
          wordnum++;
2674
0
        }
2675
        // second word is acceptable, as a word with prefix or/and suffix?
2676
        // hungarian conventions: compounding is acceptable,
2677
        // when compound forms consist 2 word, otherwise
2678
        // the syllable number of root words is 6, or lesser.
2679
0
        if ((rv) &&
2680
0
            (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2681
0
             ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
2682
0
            ((!checkcompounddup || (rv != rv_first)))) {
2683
0
          std::string m;
2684
0
          if (compoundflag)
2685
0
            m = affix_check_morph(word, i, word.size() - i, compoundflag);
2686
0
          if (m.empty() && compoundend) {
2687
0
            m = affix_check_morph(word, i, word.size() - i, compoundend);
2688
0
          }
2689
0
          result.append(presult);
2690
0
          if (!m.empty()) {
2691
0
            result.push_back(MSEP_FLD);
2692
0
            result.append(MORPH_PART);
2693
0
            result.append(word, i, word.size());
2694
0
            line_uniq_app(m, MSEP_REC);
2695
0
            result.push_back(MSEP_FLD);
2696
0
            result.append(m);
2697
0
          }
2698
0
          result.push_back(MSEP_REC);
2699
0
          ok = 1;
2700
0
        }
2701
2702
0
        numsyllable = oldnumsyllable2;
2703
0
        wordnum = oldwordnum2;
2704
2705
        // perhaps second word is a compound word (recursive call)
2706
0
        if ((wordnum + 2 < maxwordnum) && (ok == 0)) {
2707
0
          compound_check_morph(word.substr(i), wordnum + 1,
2708
0
                               numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
2709
0
                               result, &presult);
2710
0
        } else {
2711
0
          rv = NULL;
2712
0
        }
2713
0
      }
2714
0
      st[i] = ch;
2715
0
      wordnum = oldwordnum;
2716
0
      numsyllable = oldnumsyllable;
2717
2718
0
    } while (!defcpdtable.empty() && oldwordnum == 0 &&
2719
0
             onlycpdrule++ < 1);  // end of onlycpd loop
2720
0
  }
2721
0
  return 0;
2722
0
}
2723
2724
2725
// Match the NUL-terminated pattern s1 against a string that is read
// backwards, starting at end_of_s2 and moving towards lower addresses.
// A '.' in s1 matches any character. At most `len` characters are compared.
// Returns non-zero only when the end of s1 was reached, i.e. every pattern
// character matched within the allowed length.
inline int AffixMgr::isRevSubset(const char* s1,
                                 const char* end_of_s2,
                                 int len) {
  for (int remaining = len; remaining > 0 && *s1 != '\0';
       --remaining, ++s1, --end_of_s2) {
    // stop at the first position that neither matches nor is a wildcard
    if (*s1 != *end_of_s2 && *s1 != '.')
      break;
  }
  return (*s1 == '\0');
}
2735
2736
// check word for suffixes
2737
struct hentry* AffixMgr::suffix_check(const std::string& word,
2738
                                      int start,
2739
                                      int len,
2740
                                      int sfxopts,
2741
                                      PfxEntry* ppfx,
2742
                                      const FLAG cclass,
2743
                                      const FLAG needflag,
2744
1.41G
                                      char in_compound) {
2745
1.41G
  struct hentry* rv = NULL;
2746
1.41G
  PfxEntry* ep = ppfx;
2747
2748
  // first handle the special case of 0 length suffixes
2749
1.41G
  SfxEntry* se = sStart[0];
2750
2751
2.22G
  while (se) {
2752
806M
    if (!cclass || se->getCont()) {
2753
      // suffixes are not allowed in beginning of compounds
2754
765M
      if ((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
2755
           // except when signed with compoundpermitflag flag
2756
765M
           (se->getCont() && compoundpermitflag &&
2757
383M
            TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) &&
2758
765M
          (!circumfix ||
2759
           // no circumfix flag in prefix and suffix
2760
387M
           ((!ppfx || !(ep->getCont()) ||
2761
21.8M
             !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2762
21.8M
            (!se->getCont() ||
2763
21.7M
             !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) ||
2764
           // circumfix flag in prefix AND suffix
2765
387M
           ((ppfx && (ep->getCont()) &&
2766
307k
             TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2767
307k
            (se->getCont() &&
2768
48.8k
             (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) &&
2769
          // fogemorpheme
2770
765M
          (in_compound ||
2771
387M
           !(se->getCont() &&
2772
372M
             (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) &&
2773
          // needaffix on prefix or first suffix
2774
765M
          (cclass ||
2775
347M
           !(se->getCont() &&
2776
281M
             TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2777
347M
           (ppfx &&
2778
30.1M
            !((ep->getCont()) &&
2779
317M
              TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))) {
2780
317M
        rv = se->checkword(word, start, len, sfxopts, ppfx,
2781
317M
                           (FLAG)cclass, needflag,
2782
317M
                           (in_compound ? 0 : onlyincompound));
2783
317M
        if (rv) {
2784
322k
          sfx = se;  // BUG: sfx not stateless
2785
322k
          return rv;
2786
322k
        }
2787
317M
      }
2788
765M
    }
2789
806M
    se = se->getNext();
2790
806M
  }
2791
2792
  // now handle the general case
2793
1.41G
  if (len == 0)
2794
46.5k
    return NULL;  // FULLSTRIP
2795
1.41G
  unsigned char sp = word[start + len - 1];
2796
1.41G
  SfxEntry* sptr = sStart[sp];
2797
2798
1.49G
  while (sptr) {
2799
83.8M
    if (isRevSubset(sptr->getKey(), word.c_str() + start + len - 1, len)) {
2800
      // suffixes are not allowed in beginning of compounds
2801
82.4M
      if ((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
2802
           // except when signed with compoundpermitflag flag
2803
82.4M
           (sptr->getCont() && compoundpermitflag &&
2804
43.7M
            TESTAFF(sptr->getCont(), compoundpermitflag,
2805
43.7M
                    sptr->getContLen()))) &&
2806
82.4M
          (!circumfix ||
2807
           // no circumfix flag in prefix and suffix
2808
40.1M
           ((!ppfx || !(ep->getCont()) ||
2809
3.92M
             !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2810
3.92M
            (!sptr->getCont() ||
2811
3.91M
             !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) ||
2812
           // circumfix flag in prefix AND suffix
2813
40.1M
           ((ppfx && (ep->getCont()) &&
2814
91.2k
             TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2815
91.2k
            (sptr->getCont() &&
2816
11.3k
             (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) &&
2817
          // fogemorpheme
2818
82.4M
          (in_compound ||
2819
40.0M
           !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
2820
15.6M
                                          sptr->getContLen()))))) &&
2821
          // needaffix on prefix or first suffix
2822
82.4M
          (cclass ||
2823
39.7M
           !(sptr->getCont() &&
2824
28.8M
             TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
2825
39.7M
           (ppfx &&
2826
3.36M
            !((ep->getCont()) &&
2827
151k
              TESTAFF(ep->getCont(), needaffix, ep->getContLen())))))
2828
36.5M
        if (in_compound != IN_CPD_END || ppfx ||
2829
36.5M
            !(sptr->getCont() &&
2830
36.5M
              TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) {
2831
36.5M
          rv = sptr->checkword(word, start, len, sfxopts, ppfx,
2832
36.5M
                               cclass, needflag,
2833
36.5M
                               (in_compound ? 0 : onlyincompound));
2834
36.5M
          if (rv) {
2835
39.4k
            sfx = sptr;                 // BUG: sfx not stateless
2836
39.4k
            sfxflag = sptr->getFlag();  // BUG: sfxflag not stateless
2837
39.4k
            if (!sptr->getCont())
2838
12.8k
              sfxappnd = sptr->getKey();  // BUG: sfxappnd not stateless
2839
            // LANG_hu section: spec. Hungarian rule
2840
26.6k
            else if (langnum == LANG_hu && sptr->getKeyLen() &&
2841
26.6k
                     sptr->getKey()[0] == 'i' && sptr->getKey()[1] != 'y' &&
2842
26.6k
                     sptr->getKey()[1] != 't') {
2843
0
              sfxextra = 1;
2844
0
            }
2845
            // END of LANG_hu section
2846
39.4k
            return rv;
2847
39.4k
          }
2848
36.5M
        }
2849
82.4M
      sptr = sptr->getNextEQ();
2850
82.4M
    } else {
2851
1.38M
      sptr = sptr->getNextNE();
2852
1.38M
    }
2853
83.8M
  }
2854
2855
1.41G
  return NULL;
2856
1.41G
}
2857
2858
// check word for two-level suffixes
2859
struct hentry* AffixMgr::suffix_check_twosfx(const std::string& word,
2860
                                             int start,
2861
                                             int len,
2862
                                             int sfxopts,
2863
                                             PfxEntry* ppfx,
2864
87.9M
                                             const FLAG needflag) {
2865
87.9M
  struct hentry* rv = NULL;
2866
2867
  // first handle the special case of 0 length suffixes
2868
87.9M
  SfxEntry* se = sStart[0];
2869
188M
  while (se) {
2870
100M
    if (contclasses[se->getFlag()]) {
2871
82.1M
      rv = se->check_twosfx(word, start, len, sfxopts, ppfx, needflag);
2872
82.1M
      if (rv)
2873
245k
        return rv;
2874
82.1M
    }
2875
100M
    se = se->getNext();
2876
100M
  }
2877
2878
  // now handle the general case
2879
87.6M
  if (len == 0)
2880
12.9k
    return NULL;  // FULLSTRIP
2881
87.6M
  unsigned char sp = word[start + len - 1];
2882
87.6M
  SfxEntry* sptr = sStart[sp];
2883
2884
99.9M
  while (sptr) {
2885
12.3M
    if (isRevSubset(sptr->getKey(), word.c_str() + start + len - 1, len)) {
2886
12.1M
      if (contclasses[sptr->getFlag()]) {
2887
9.47M
        rv = sptr->check_twosfx(word, start, len, sfxopts, ppfx, needflag);
2888
9.47M
        if (rv) {
2889
9.12k
          sfxflag = sptr->getFlag();  // BUG: sfxflag not stateless
2890
9.12k
          if (!sptr->getCont())
2891
3.89k
            sfxappnd = sptr->getKey();  // BUG: sfxappnd not stateless
2892
9.12k
          return rv;
2893
9.12k
        }
2894
9.47M
      }
2895
12.1M
      sptr = sptr->getNextEQ();
2896
12.1M
    } else {
2897
173k
      sptr = sptr->getNextNE();
2898
173k
    }
2899
12.3M
  }
2900
2901
87.6M
  return NULL;
2902
87.6M
}
2903
2904
// check word for two-level suffixes and morph
2905
std::string AffixMgr::suffix_check_twosfx_morph(const std::string& word,
2906
                                                int start,
2907
                                                int len,
2908
                                                int sfxopts,
2909
                                                PfxEntry* ppfx,
2910
0
                                                const FLAG needflag) {
2911
0
  std::string result;
2912
0
  std::string result2;
2913
0
  std::string result3;
2914
2915
  // first handle the special case of 0 length suffixes
2916
0
  SfxEntry* se = sStart[0];
2917
0
  while (se) {
2918
0
    if (contclasses[se->getFlag()]) {
2919
0
      std::string st = se->check_twosfx_morph(word, start, len, sfxopts, ppfx, needflag);
2920
0
      if (!st.empty()) {
2921
0
        if (ppfx) {
2922
0
          if (ppfx->getMorph()) {
2923
0
            result.append(ppfx->getMorph());
2924
0
            result.push_back(MSEP_FLD);
2925
0
          } else
2926
0
            debugflag(result, ppfx->getFlag());
2927
0
        }
2928
0
        result.append(st);
2929
0
        if (se->getMorph()) {
2930
0
          result.push_back(MSEP_FLD);
2931
0
          result.append(se->getMorph());
2932
0
        } else
2933
0
          debugflag(result, se->getFlag());
2934
0
        result.push_back(MSEP_REC);
2935
0
      }
2936
0
    }
2937
0
    se = se->getNext();
2938
0
  }
2939
2940
  // now handle the general case
2941
0
  if (len == 0)
2942
0
    return { };  // FULLSTRIP
2943
0
  unsigned char sp = word[start + len - 1];
2944
0
  SfxEntry* sptr = sStart[sp];
2945
2946
0
  while (sptr) {
2947
0
    if (isRevSubset(sptr->getKey(), word.c_str() + start + len - 1, len)) {
2948
0
      if (contclasses[sptr->getFlag()]) {
2949
0
        std::string st = sptr->check_twosfx_morph(word, start, len, sfxopts, ppfx, needflag);
2950
0
        if (!st.empty()) {
2951
0
          sfxflag = sptr->getFlag();  // BUG: sfxflag not stateless
2952
0
          if (!sptr->getCont())
2953
0
            sfxappnd = sptr->getKey();  // BUG: sfxappnd not stateless
2954
0
          result2.assign(st);
2955
2956
0
          result3.clear();
2957
2958
0
          if (sptr->getMorph()) {
2959
0
            result3.push_back(MSEP_FLD);
2960
0
            result3.append(sptr->getMorph());
2961
0
          } else
2962
0
            debugflag(result3, sptr->getFlag());
2963
0
          strlinecat(result2, result3);
2964
0
          result2.push_back(MSEP_REC);
2965
0
          result.append(result2);
2966
0
        }
2967
0
      }
2968
0
      sptr = sptr->getNextEQ();
2969
0
    } else {
2970
0
      sptr = sptr->getNextNE();
2971
0
    }
2972
0
  }
2973
2974
0
  return result;
2975
0
}
2976
2977
std::string AffixMgr::suffix_check_morph(const std::string& word,
2978
                                         int start,
2979
                                         int len,
2980
                                         int sfxopts,
2981
                                         PfxEntry* ppfx,
2982
                                         const FLAG cclass,
2983
                                         const FLAG needflag,
2984
0
                                         char in_compound) {
2985
0
  std::string result;
2986
2987
0
  struct hentry* rv = NULL;
2988
2989
0
  PfxEntry* ep = ppfx;
2990
2991
  // first handle the special case of 0 length suffixes
2992
0
  SfxEntry* se = sStart[0];
2993
0
  while (se) {
2994
0
    if (!cclass || se->getCont()) {
2995
      // suffixes are not allowed in beginning of compounds
2996
0
      if (((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
2997
            // except when signed with compoundpermitflag flag
2998
0
            (se->getCont() && compoundpermitflag &&
2999
0
             TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) &&
3000
0
           (!circumfix ||
3001
            // no circumfix flag in prefix and suffix
3002
0
            ((!ppfx || !(ep->getCont()) ||
3003
0
              !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
3004
0
             (!se->getCont() ||
3005
0
              !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) ||
3006
            // circumfix flag in prefix AND suffix
3007
0
            ((ppfx && (ep->getCont()) &&
3008
0
              TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
3009
0
             (se->getCont() &&
3010
0
              (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) &&
3011
           // fogemorpheme
3012
0
           (in_compound ||
3013
0
            !((se->getCont() &&
3014
0
               (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
3015
           // needaffix on prefix or first suffix
3016
0
           (cclass ||
3017
0
            !(se->getCont() &&
3018
0
              TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
3019
0
            (ppfx &&
3020
0
             !((ep->getCont()) &&
3021
0
               TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))))
3022
0
        rv = se->checkword(word, start, len, sfxopts, ppfx, cclass,
3023
0
                           needflag, FLAG_NULL);
3024
0
      while (rv) {
3025
0
        if (ppfx) {
3026
0
          if (ppfx->getMorph()) {
3027
0
            result.append(ppfx->getMorph());
3028
0
            result.push_back(MSEP_FLD);
3029
0
          } else
3030
0
            debugflag(result, ppfx->getFlag());
3031
0
        }
3032
0
        if (complexprefixes && HENTRY_DATA(rv))
3033
0
          result.append(HENTRY_DATA2(rv));
3034
0
        if (!HENTRY_FIND(rv, MORPH_STEM)) {
3035
0
          result.push_back(MSEP_FLD);
3036
0
          result.append(MORPH_STEM);
3037
0
          result.append(HENTRY_WORD(rv));
3038
0
        }
3039
3040
0
        if (!complexprefixes && HENTRY_DATA(rv)) {
3041
0
          result.push_back(MSEP_FLD);
3042
0
          result.append(HENTRY_DATA2(rv));
3043
0
        }
3044
0
        if (se->getMorph()) {
3045
0
          result.push_back(MSEP_FLD);
3046
0
          result.append(se->getMorph());
3047
0
        } else
3048
0
          debugflag(result, se->getFlag());
3049
0
        result.push_back(MSEP_REC);
3050
0
        rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
3051
0
      }
3052
0
    }
3053
0
    se = se->getNext();
3054
0
  }
3055
3056
  // now handle the general case
3057
0
  if (len == 0)
3058
0
    return { };  // FULLSTRIP
3059
0
  unsigned char sp = word[start + len - 1];
3060
0
  SfxEntry* sptr = sStart[sp];
3061
3062
0
  while (sptr) {
3063
0
    if (isRevSubset(sptr->getKey(), word.c_str() + start + len - 1, len)) {
3064
      // suffixes are not allowed in beginning of compounds
3065
0
      if (((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
3066
            // except when signed with compoundpermitflag flag
3067
0
            (sptr->getCont() && compoundpermitflag &&
3068
0
             TESTAFF(sptr->getCont(), compoundpermitflag,
3069
0
                     sptr->getContLen()))) &&
3070
0
           (!circumfix ||
3071
            // no circumfix flag in prefix and suffix
3072
0
            ((!ppfx || !(ep->getCont()) ||
3073
0
              !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
3074
0
             (!sptr->getCont() ||
3075
0
              !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) ||
3076
            // circumfix flag in prefix AND suffix
3077
0
            ((ppfx && (ep->getCont()) &&
3078
0
              TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
3079
0
             (sptr->getCont() &&
3080
0
              (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) &&
3081
           // fogemorpheme
3082
0
           (in_compound ||
3083
0
            !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
3084
0
                                           sptr->getContLen()))))) &&
3085
           // needaffix on first suffix
3086
0
           (cclass ||
3087
0
            !(sptr->getCont() &&
3088
0
              TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())))))
3089
0
        rv = sptr->checkword(word, start, len, sfxopts, ppfx, cclass,
3090
0
                             needflag, FLAG_NULL);
3091
0
      while (rv) {
3092
0
        if (ppfx) {
3093
0
          if (ppfx->getMorph()) {
3094
0
            result.append(ppfx->getMorph());
3095
0
            result.push_back(MSEP_FLD);
3096
0
          } else
3097
0
            debugflag(result, ppfx->getFlag());
3098
0
        }
3099
0
        if (complexprefixes && HENTRY_DATA(rv))
3100
0
          result.append(HENTRY_DATA2(rv));
3101
0
        if (!HENTRY_FIND(rv, MORPH_STEM)) {
3102
0
          result.push_back(MSEP_FLD);
3103
0
          result.append(MORPH_STEM);
3104
0
          result.append(HENTRY_WORD(rv));
3105
0
        }
3106
3107
0
        if (!complexprefixes && HENTRY_DATA(rv)) {
3108
0
          result.push_back(MSEP_FLD);
3109
0
          result.append(HENTRY_DATA2(rv));
3110
0
        }
3111
3112
0
        if (sptr->getMorph()) {
3113
0
          result.push_back(MSEP_FLD);
3114
0
          result.append(sptr->getMorph());
3115
0
        } else
3116
0
          debugflag(result, sptr->getFlag());
3117
0
        result.push_back(MSEP_REC);
3118
0
        rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
3119
0
      }
3120
0
      sptr = sptr->getNextEQ();
3121
0
    } else {
3122
0
      sptr = sptr->getNextNE();
3123
0
    }
3124
0
  }
3125
3126
0
  return result;
3127
0
}
3128
3129
// check if word with affixes is correctly spelled
3130
struct hentry* AffixMgr::affix_check(const std::string& word,
3131
                                     int start,
3132
                                     int len,
3133
                                     const FLAG needflag,
3134
41.9M
                                     char in_compound) {
3135
3136
  // check all prefixes (also crossed with suffixes if allowed)
3137
41.9M
  struct hentry* rv = prefix_check(word, start, len, in_compound, needflag);
3138
41.9M
  if (rv)
3139
161k
    return rv;
3140
3141
  // if still not found check all suffixes
3142
41.7M
  rv = suffix_check(word, start, len, 0, NULL, FLAG_NULL, needflag, in_compound);
3143
3144
41.7M
  if (havecontclass) {
3145
15.0M
    sfx = NULL;
3146
15.0M
    pfx = NULL;
3147
3148
15.0M
    if (rv)
3149
38.9k
      return rv;
3150
    // if still not found check all two-level suffixes
3151
15.0M
    rv = suffix_check_twosfx(word, start, len, 0, NULL, needflag);
3152
3153
15.0M
    if (rv)
3154
10.3k
      return rv;
3155
    // if still not found check all two-level suffixes
3156
15.0M
    rv = prefix_check_twosfx(word, start, len, IN_CPD_NOT, needflag);
3157
15.0M
  }
3158
3159
41.7M
  return rv;
3160
41.7M
}
3161
3162
// check if word with affixes is correctly spelled
3163
std::string AffixMgr::affix_check_morph(const std::string& word,
3164
                                  int start,
3165
                                  int len,
3166
                                  const FLAG needflag,
3167
0
                                  char in_compound) {
3168
0
  std::string result;
3169
3170
  // check all prefixes (also crossed with suffixes if allowed)
3171
0
  std::string st = prefix_check_morph(word, start, len, in_compound);
3172
0
  if (!st.empty()) {
3173
0
    result.append(st);
3174
0
  }
3175
3176
  // if still not found check all suffixes
3177
0
  st = suffix_check_morph(word, start, len, 0, NULL, '\0', needflag, in_compound);
3178
0
  if (!st.empty()) {
3179
0
    result.append(st);
3180
0
  }
3181
3182
0
  if (havecontclass) {
3183
0
    sfx = NULL;
3184
0
    pfx = NULL;
3185
    // if still not found check all two-level suffixes
3186
0
    st = suffix_check_twosfx_morph(word, start, len, 0, NULL, needflag);
3187
0
    if (!st.empty()) {
3188
0
      result.append(st);
3189
0
    }
3190
3191
    // if still not found check all two-level suffixes
3192
0
    st = prefix_check_twosfx_morph(word, start, len, IN_CPD_NOT, needflag);
3193
0
    if (!st.empty()) {
3194
0
      result.append(st);
3195
0
    }
3196
0
  }
3197
3198
0
  return result;
3199
0
}
3200
3201
// morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields
3202
// in the first line of the inputs
3203
// return 0, if inputs equal
3204
// return 1, if inputs may equal with a secondary suffix
3205
// otherwise return -1
3206
0
static int morphcmp(const char* s, const char* t) {
3207
0
  int se = 0, te = 0;
3208
0
  const char* sl;
3209
0
  const char* tl;
3210
0
  const char* olds;
3211
0
  const char* oldt;
3212
0
  if (!s || !t)
3213
0
    return 1;
3214
0
  olds = s;
3215
0
  sl = strchr(s, '\n');
3216
0
  s = strstr(s, MORPH_DERI_SFX);
3217
0
  if (!s || (sl && sl < s))
3218
0
    s = strstr(olds, MORPH_INFL_SFX);
3219
0
  if (!s || (sl && sl < s)) {
3220
0
    s = strstr(olds, MORPH_TERM_SFX);
3221
0
    olds = NULL;
3222
0
  }
3223
0
  oldt = t;
3224
0
  tl = strchr(t, '\n');
3225
0
  t = strstr(t, MORPH_DERI_SFX);
3226
0
  if (!t || (tl && tl < t))
3227
0
    t = strstr(oldt, MORPH_INFL_SFX);
3228
0
  if (!t || (tl && tl < t)) {
3229
0
    t = strstr(oldt, MORPH_TERM_SFX);
3230
0
    oldt = NULL;
3231
0
  }
3232
0
  while (s && t && (!sl || sl > s) && (!tl || tl > t)) {
3233
0
    s += MORPH_TAG_LEN;
3234
0
    t += MORPH_TAG_LEN;
3235
0
    se = 0;
3236
0
    te = 0;
3237
0
    while ((*s == *t) && !se && !te) {
3238
0
      s++;
3239
0
      t++;
3240
0
      switch (*s) {
3241
0
        case ' ':
3242
0
        case '\n':
3243
0
        case '\t':
3244
0
        case '\0':
3245
0
          se = 1;
3246
0
      }
3247
0
      switch (*t) {
3248
0
        case ' ':
3249
0
        case '\n':
3250
0
        case '\t':
3251
0
        case '\0':
3252
0
          te = 1;
3253
0
      }
3254
0
    }
3255
0
    if (!se || !te) {
3256
      // not terminal suffix difference
3257
0
      if (olds)
3258
0
        return -1;
3259
0
      return 1;
3260
0
    }
3261
0
    olds = s;
3262
0
    s = strstr(s, MORPH_DERI_SFX);
3263
0
    if (!s || (sl && sl < s))
3264
0
      s = strstr(olds, MORPH_INFL_SFX);
3265
0
    if (!s || (sl && sl < s)) {
3266
0
      s = strstr(olds, MORPH_TERM_SFX);
3267
0
      olds = NULL;
3268
0
    }
3269
0
    oldt = t;
3270
0
    t = strstr(t, MORPH_DERI_SFX);
3271
0
    if (!t || (tl && tl < t))
3272
0
      t = strstr(oldt, MORPH_INFL_SFX);
3273
0
    if (!t || (tl && tl < t)) {
3274
0
      t = strstr(oldt, MORPH_TERM_SFX);
3275
0
      oldt = NULL;
3276
0
    }
3277
0
  }
3278
0
  if (!s && !t && se && te)
3279
0
    return 0;
3280
0
  return 1;
3281
0
}
3282
3283
std::string AffixMgr::morphgen(const char* ts,
3284
                               int wl,
3285
                               const unsigned short* ap,
3286
                               unsigned short al,
3287
                               const char* morph,
3288
                               const char* targetmorph,
3289
0
                         int level) {
3290
  // handle suffixes
3291
0
  if (!morph)
3292
0
    return {};
3293
3294
  // check substandard flag
3295
0
  if (TESTAFF(ap, substandard, al))
3296
0
    return {};
3297
3298
0
  if (morphcmp(morph, targetmorph) == 0)
3299
0
    return ts;
3300
3301
0
  size_t stemmorphcatpos;
3302
0
  std::string mymorph;
3303
3304
  // use input suffix fields, if exist
3305
0
  if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {
3306
0
    mymorph.assign(morph);
3307
0
    mymorph.push_back(MSEP_FLD);
3308
0
    stemmorphcatpos = mymorph.size();
3309
0
  } else {
3310
0
    stemmorphcatpos = std::string::npos;
3311
0
  }
3312
3313
0
  for (int i = 0; i < al; i++) {
3314
0
    const auto c = (unsigned char)(ap[i] & 0x00FF);
3315
0
    SfxEntry* sptr = sFlag[c];
3316
0
    while (sptr) {
3317
0
      if (sptr->getFlag() == ap[i] && sptr->getMorph() &&
3318
0
          ((sptr->getContLen() == 0) ||
3319
           // don't generate forms with substandard affixes
3320
0
           !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) {
3321
0
        const char* stemmorph;
3322
0
        if (stemmorphcatpos != std::string::npos) {
3323
0
          mymorph.replace(stemmorphcatpos, std::string::npos, sptr->getMorph());
3324
0
          stemmorph = mymorph.c_str();
3325
0
        } else {
3326
0
          stemmorph = sptr->getMorph();
3327
0
        }
3328
3329
0
        int cmp = morphcmp(stemmorph, targetmorph);
3330
3331
0
        if (cmp == 0) {
3332
0
          std::string newword = sptr->add(ts, wl);
3333
0
          if (!newword.empty()) {
3334
0
            hentry* check = pHMgr->lookup(newword.c_str(), newword.size());  // XXX extra dic
3335
0
            if (!check || !check->astr ||
3336
0
                !(TESTAFF(check->astr, forbiddenword, check->alen) ||
3337
0
                  TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) {
3338
0
              return newword;
3339
0
            }
3340
0
          }
3341
0
        }
3342
3343
        // recursive call for secondary suffixes
3344
0
        if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) &&
3345
0
            !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) {
3346
0
          std::string newword = sptr->add(ts, wl);
3347
0
          if (!newword.empty()) {
3348
0
            std::string newword2 =
3349
0
                morphgen(newword.c_str(), newword.size(), sptr->getCont(),
3350
0
                         sptr->getContLen(), stemmorph, targetmorph, 1);
3351
3352
0
            if (!newword2.empty()) {
3353
0
              return newword2;
3354
0
            }
3355
0
          }
3356
0
        }
3357
0
      }
3358
0
      sptr = sptr->getFlgNxt();
3359
0
    }
3360
0
  }
3361
0
  return { };
3362
0
}
3363
3364
namespace {
  // strdup() replacement built on new[]: returns a freshly allocated copy of
  // the NUL-terminated string `s`, or NULL when `s` is NULL.
  char* mystrdup(const char* s) {
    if (!s)
      return NULL;
    const size_t bytes = strlen(s) + 1;  // include the terminating NUL
    char* copy = new char[bytes];
    memcpy(copy, s, bytes);
    return copy;
  }
}
3376
3377
int AffixMgr::expand_rootword(struct guessword* wlst,
3378
                              int maxn,
3379
                              const char* ts,
3380
                              int wl,
3381
                              const unsigned short* ap,
3382
                              unsigned short al,
3383
                              const char* bad,
3384
                              int badl,
3385
1.06M
                              const char* phon) {
3386
1.06M
  int nh = 0;
3387
  // first add root word to list
3388
1.06M
  if ((nh < maxn) &&
3389
1.06M
      !(al && ((needaffix && TESTAFF(ap, needaffix, al)) ||
3390
1.06M
               (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
3391
1.06M
    wlst[nh].word = mystrdup(ts);
3392
1.06M
    wlst[nh].allow = false;
3393
1.06M
    wlst[nh].orig = NULL;
3394
1.06M
    nh++;
3395
    // add special phonetic version
3396
1.06M
    if (phon && (nh < maxn)) {
3397
78.9k
      wlst[nh].word = mystrdup(phon);
3398
78.9k
      wlst[nh].allow = false;
3399
78.9k
      wlst[nh].orig = mystrdup(ts);
3400
78.9k
      nh++;
3401
78.9k
    }
3402
1.06M
  }
3403
3404
  // handle suffixes
3405
1.97M
  for (int i = 0; i < al; i++) {
3406
910k
    const auto c = (unsigned char)(ap[i] & 0x00FF);
3407
910k
    SfxEntry* sptr = sFlag[c];
3408
1.05M
    while (sptr) {
3409
140k
      if ((sptr->getFlag() == ap[i]) &&
3410
140k
          (!sptr->getKeyLen() ||
3411
135k
           ((badl > sptr->getKeyLen()) &&
3412
64.9k
            (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&
3413
          // check needaffix flag
3414
140k
          !(sptr->getCont() &&
3415
72.0k
            ((needaffix &&
3416
61.2k
              TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
3417
61.2k
             (circumfix &&
3418
60.8k
              TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||
3419
61.2k
             (onlyincompound &&
3420
70.2k
              TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) {
3421
70.2k
        std::string newword = sptr->add(ts, wl);
3422
70.2k
        if (!newword.empty()) {
3423
27.1k
          if (nh < maxn) {
3424
26.7k
            wlst[nh].word = mystrdup(newword.c_str());
3425
26.7k
            wlst[nh].allow = sptr->allowCross();
3426
26.7k
            wlst[nh].orig = NULL;
3427
26.7k
            nh++;
3428
            // add special phonetic version
3429
26.7k
            if (phon && (nh < maxn)) {
3430
4.07k
              std::string prefix(phon);
3431
4.07k
              std::string key(sptr->getKey());
3432
4.07k
              reverseword(key);
3433
4.07k
              prefix.append(key);
3434
4.07k
              wlst[nh].word = mystrdup(prefix.c_str());
3435
4.07k
              wlst[nh].allow = false;
3436
4.07k
              wlst[nh].orig = mystrdup(newword.c_str());
3437
4.07k
              nh++;
3438
4.07k
            }
3439
26.7k
          }
3440
27.1k
        }
3441
70.2k
      }
3442
140k
      sptr = sptr->getFlgNxt();
3443
140k
    }
3444
910k
  }
3445
3446
1.06M
  int n = nh;
3447
3448
  // handle cross products of prefixes and suffixes
3449
1.17M
  for (int j = 1; j < n; j++)
3450
109k
    if (wlst[j].allow) {
3451
271k
      for (int k = 0; k < al; k++) {
3452
259k
        const auto c = (unsigned char)(ap[k] & 0x00FF);
3453
259k
        PfxEntry* cptr = pFlag[c];
3454
428k
        while (cptr) {
3455
169k
          if ((cptr->getFlag() == ap[k]) && cptr->allowCross() &&
3456
169k
              (!cptr->getKeyLen() ||
3457
64.4k
               ((badl > cptr->getKeyLen()) &&
3458
57.5k
                (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
3459
57.5k
            int l1 = strlen(wlst[j].word);
3460
57.5k
            std::string newword = cptr->add(wlst[j].word, l1);
3461
57.5k
            if (!newword.empty()) {
3462
27.4k
              if (nh < maxn) {
3463
8.13k
                wlst[nh].word = mystrdup(newword.c_str());
3464
8.13k
                wlst[nh].allow = cptr->allowCross();
3465
8.13k
                wlst[nh].orig = NULL;
3466
8.13k
                nh++;
3467
8.13k
              }
3468
27.4k
            }
3469
57.5k
          }
3470
169k
          cptr = cptr->getFlgNxt();
3471
169k
        }
3472
259k
      }
3473
12.0k
    }
3474
3475
  // now handle pure prefixes
3476
1.97M
  for (int m = 0; m < al; m++) {
3477
910k
    const auto c = (unsigned char)(ap[m] & 0x00FF);
3478
910k
    PfxEntry* ptr = pFlag[c];
3479
1.08M
    while (ptr) {
3480
172k
      if ((ptr->getFlag() == ap[m]) &&
3481
172k
          (!ptr->getKeyLen() ||
3482
167k
           ((badl > ptr->getKeyLen()) &&
3483
98.1k
            (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&
3484
          // check needaffix flag
3485
172k
          !(ptr->getCont() &&
3486
70.1k
            ((needaffix &&
3487
64.0k
              TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) ||
3488
64.0k
             (circumfix &&
3489
63.3k
              TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
3490
64.0k
             (onlyincompound &&
3491
67.0k
              TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))) {
3492
67.0k
        std::string newword = ptr->add(ts, wl);
3493
67.0k
        if (!newword.empty()) {
3494
27.5k
          if (nh < maxn) {
3495
26.6k
            wlst[nh].word = mystrdup(newword.c_str());
3496
26.6k
            wlst[nh].allow = ptr->allowCross();
3497
26.6k
            wlst[nh].orig = NULL;
3498
26.6k
            nh++;
3499
26.6k
          }
3500
27.5k
        }
3501
67.0k
      }
3502
172k
      ptr = ptr->getFlgNxt();
3503
172k
    }
3504
910k
  }
3505
3506
1.06M
  return nh;
3507
1.06M
}
3508
3509
// return replacing table
3510
647k
const std::vector<replentry>& AffixMgr::get_reptable() const {
3511
647k
  return pHMgr->get_reptable();
3512
647k
}
3513
3514
// return iconv table
3515
3.09M
RepList* AffixMgr::get_iconvtable() const {
3516
3.09M
  if (!iconvtable)
3517
2.76M
    return NULL;
3518
326k
  return iconvtable;
3519
3.09M
}
3520
3521
// return oconv table
3522
108k
RepList* AffixMgr::get_oconvtable() const {
3523
108k
  if (!oconvtable)
3524
106k
    return NULL;
3525
2.15k
  return oconvtable;
3526
108k
}
3527
3528
// return replacing table
3529
105k
struct phonetable* AffixMgr::get_phonetable() const {
3530
105k
  if (!phone)
3531
73.2k
    return NULL;
3532
31.8k
  return phone;
3533
105k
}
3534
3535
// return character map table
3536
407k
const std::vector<mapentry>& AffixMgr::get_maptable() const {
3537
407k
  return maptable;
3538
407k
}
3539
3540
// return character map table
3541
18.3k
const std::vector<std::string>& AffixMgr::get_breaktable() const {
3542
18.3k
  return breaktable;
3543
18.3k
}
3544
3545
// return text encoding of dictionary
3546
46.2k
const std::string& AffixMgr::get_encoding() {
3547
46.2k
  if (encoding.empty())
3548
13.0k
    encoding = SPELL_ENCODING;
3549
46.2k
  return encoding;
3550
46.2k
}
3551
3552
// return text encoding of dictionary
3553
36.7k
int AffixMgr::get_langnum() const {
3554
36.7k
  return langnum;
3555
36.7k
}
3556
3557
// return double prefix option
3558
36.7k
int AffixMgr::get_complexprefixes() const {
3559
36.7k
  return complexprefixes;
3560
36.7k
}
3561
3562
// return FULLSTRIP option
3563
5.72M
int AffixMgr::get_fullstrip() const {
3564
5.72M
  return fullstrip;
3565
5.72M
}
3566
3567
153k
// return the keepcase control flag
FLAG AffixMgr::get_keepcase() const {
  return keepcase;
}
3570
3571
48.5k
// return the forceucase control flag
FLAG AffixMgr::get_forceucase() const {
  return forceucase;
}
3574
3575
280k
// return the warn control flag
FLAG AffixMgr::get_warn() const {
  return warn;
}
3578
3579
10.5k
int AffixMgr::get_forbidwarn() const {
3580
10.5k
  return forbidwarn;
3581
10.5k
}
3582
3583
173k
int AffixMgr::get_checksharps() const {
3584
173k
  return checksharps;
3585
173k
}
3586
3587
0
std::string AffixMgr::encode_flag(unsigned short aflag) const {
3588
0
  return pHMgr->encode_flag(aflag);
3589
0
}
3590
3591
// return the preferred ignore string for suggestions
3592
7.88M
const char* AffixMgr::get_ignore() const {
3593
7.88M
  if (ignorechars.empty())
3594
7.83M
    return NULL;
3595
53.9k
  return ignorechars.c_str();
3596
7.88M
}
3597
3598
// return the preferred ignore string for suggestions
3599
27.7k
const std::vector<w_char>& AffixMgr::get_ignore_utf16() const {
3600
27.7k
  return ignorechars_utf16;
3601
27.7k
}
3602
3603
// return the keyboard string for suggestions
3604
18.3k
const std::string& AffixMgr::get_key_string() {
3605
18.3k
  if (keystring.empty())
3606
18.1k
    keystring = SPELL_KEYSTRING;
3607
18.3k
  return keystring;
3608
18.3k
}
3609
3610
// return the preferred try string for suggestions
3611
18.3k
const std::string& AffixMgr::get_try_string() const {
3612
18.3k
  return trystring;
3613
18.3k
}
3614
3615
// return the preferred try string for suggestions
3616
0
const std::string& AffixMgr::get_wordchars() const {
3617
0
  return wordchars;
3618
0
}
3619
3620
0
const std::vector<w_char>& AffixMgr::get_wordchars_utf16() const {
3621
0
  return wordchars_utf16;
3622
0
}
3623
3624
// is there compounding?
3625
331M
int AffixMgr::get_compound() const {
3626
331M
  return compoundflag || compoundbegin || !defcpdtable.empty();
3627
331M
}
3628
3629
// return the compound words control flag
3630
591k
FLAG AffixMgr::get_compoundflag() const {
3631
591k
  return compoundflag;
3632
591k
}
3633
3634
// return the forbidden words control flag
3635
1.85M
FLAG AffixMgr::get_forbiddenword() const {
3636
1.85M
  return forbiddenword;
3637
1.85M
}
3638
3639
// return the forbidden words control flag
3640
449k
FLAG AffixMgr::get_nosuggest() const {
3641
449k
  return nosuggest;
3642
449k
}
3643
3644
// return the forbidden words control flag
3645
105k
FLAG AffixMgr::get_nongramsuggest() const {
3646
105k
  return nongramsuggest;
3647
105k
}
3648
3649
// return the substandard root/affix control flag
3650
51.2k
FLAG AffixMgr::get_substandard() const {
3651
51.2k
  return substandard;
3652
51.2k
}
3653
3654
// return the forbidden words flag modify flag
3655
14.1M
FLAG AffixMgr::get_needaffix() const {
3656
14.1M
  return needaffix;
3657
14.1M
}
3658
3659
// return the onlyincompound flag
3660
320k
FLAG AffixMgr::get_onlyincompound() const {
3661
320k
  return onlyincompound;
3662
320k
}
3663
3664
// return the value of suffix
3665
0
const std::string& AffixMgr::get_version() const {
3666
0
  return version;
3667
0
}
3668
3669
// utility method to look up root words in hash table
3670
2.09G
struct hentry* AffixMgr::lookup(const char* word, size_t len) {
3671
2.09G
  struct hentry* he = NULL;
3672
4.18G
  for (size_t i = 0; i < alldic.size() && !he; ++i) {
3673
2.09G
    he = alldic[i]->lookup(word, len);
3674
2.09G
  }
3675
2.09G
  return he;
3676
2.09G
}
3677
3678
// return the value of suffix
3679
214M
int AffixMgr::have_contclass() const {
3680
214M
  return havecontclass;
3681
214M
}
3682
3683
// return utf8
3684
36.7k
int AffixMgr::get_utf8() const {
3685
36.7k
  return utf8;
3686
36.7k
}
3687
3688
124k
int AffixMgr::get_maxngramsugs(void) const {
3689
124k
  return maxngramsugs;
3690
124k
}
3691
3692
18.4k
int AffixMgr::get_maxcpdsugs(void) const {
3693
18.4k
  return maxcpdsugs;
3694
18.4k
}
3695
3696
56.4k
int AffixMgr::get_maxdiff(void) const {
3697
56.4k
  return maxdiff;
3698
56.4k
}
3699
3700
17.3k
int AffixMgr::get_onlymaxdiff(void) const {
3701
17.3k
  return onlymaxdiff;
3702
17.3k
}
3703
3704
// return nosplitsugs
3705
18.3k
int AffixMgr::get_nosplitsugs(void) const {
3706
18.3k
  return nosplitsugs;
3707
18.3k
}
3708
3709
// return sugswithdots
3710
25.1k
int AffixMgr::get_sugswithdots(void) const {
3711
25.1k
  return sugswithdots;
3712
25.1k
}
3713
3714
/* parse flag */
3715
13.1k
bool AffixMgr::parse_flag(const std::string& line, unsigned short* out, FileMgr* af) {
3716
13.1k
  if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) {
3717
255
    HUNSPELL_WARNING(
3718
255
        stderr,
3719
255
        "error: line %d: multiple definitions of an affix file parameter\n",
3720
255
        af->getlinenum());
3721
255
    return false;
3722
255
  }
3723
12.8k
  std::string s;
3724
12.8k
  if (!parse_string(line, s, af->getlinenum()))
3725
128
    return false;
3726
12.7k
  *out = pHMgr->decode_flag(s);
3727
12.7k
  return true;
3728
12.8k
}
3729
3730
/* parse num */
3731
2.31k
bool AffixMgr::parse_num(const std::string& line, int* out, FileMgr* af) {
3732
2.31k
  if (*out != -1) {
3733
52
    HUNSPELL_WARNING(
3734
52
        stderr,
3735
52
        "error: line %d: multiple definitions of an affix file parameter\n",
3736
52
        af->getlinenum());
3737
52
    return false;
3738
52
  }
3739
2.26k
  std::string s;
3740
2.26k
  if (!parse_string(line, s, af->getlinenum()))
3741
25
    return false;
3742
2.23k
  *out = atoi(s.c_str());
3743
2.23k
  return true;
3744
2.26k
}
3745
3746
/* parse in the max syllablecount of compound words and  */
3747
51.4k
bool AffixMgr::parse_cpdsyllable(const std::string& line, FileMgr* af) {
3748
51.4k
  int i = 0;
3749
51.4k
  int np = 0;
3750
51.4k
  auto iter = line.begin(), start_piece = mystrsep(line, iter);
3751
235k
  while (start_piece != line.end()) {
3752
183k
    switch (i) {
3753
51.4k
      case 0: {
3754
51.4k
        np++;
3755
51.4k
        break;
3756
0
      }
3757
51.4k
      case 1: {
3758
51.4k
        cpdmaxsyllable = atoi(std::string(start_piece, iter).c_str());
3759
51.4k
        np++;
3760
51.4k
        break;
3761
0
      }
3762
49.7k
      case 2: {
3763
49.7k
        if (!utf8) {
3764
3.42k
          cpdvowels.assign(start_piece, iter);
3765
3.42k
          std::sort(cpdvowels.begin(), cpdvowels.end());
3766
46.3k
        } else {
3767
46.3k
          std::string piece(start_piece, iter);
3768
46.3k
          u8_u16(cpdvowels_utf16, piece);
3769
46.3k
          std::sort(cpdvowels_utf16.begin(), cpdvowels_utf16.end());
3770
46.3k
        }
3771
49.7k
        np++;
3772
49.7k
        break;
3773
0
      }
3774
31.1k
      default:
3775
31.1k
        break;
3776
183k
    }
3777
183k
    ++i;
3778
183k
    start_piece = mystrsep(line, iter);
3779
183k
  }
3780
51.4k
  if (np < 2) {
3781
7
    HUNSPELL_WARNING(stderr,
3782
7
                     "error: line %d: missing compoundsyllable information\n",
3783
7
                     af->getlinenum());
3784
7
    return false;
3785
7
  }
3786
51.4k
  if (np == 2)
3787
1.73k
    cpdvowels = "AEIOUaeiou";
3788
51.4k
  return true;
3789
51.4k
}
3790
3791
bool AffixMgr::parse_convtable(const std::string& line,
3792
                              FileMgr* af,
3793
                              RepList** rl,
3794
2.55k
                              const std::string& keyword) {
3795
2.55k
  if (*rl) {
3796
107
    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3797
107
                     af->getlinenum());
3798
107
    return false;
3799
107
  }
3800
2.44k
  int i = 0;
3801
2.44k
  int np = 0;
3802
2.44k
  int numrl = 0;
3803
2.44k
  auto iter = line.begin(), start_piece = mystrsep(line, iter);
3804
10.8k
  while (start_piece != line.end()) {
3805
8.47k
    switch (i) {
3806
2.44k
      case 0: {
3807
2.44k
        np++;
3808
2.44k
        break;
3809
0
      }
3810
2.42k
      case 1: {
3811
2.42k
        numrl = atoi(std::string(start_piece, iter).c_str());
3812
2.42k
        if (numrl < 1) {
3813
93
          HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
3814
93
                           af->getlinenum());
3815
93
          return false;
3816
93
        }
3817
2.33k
        *rl = new RepList(numrl);
3818
2.33k
        if (!*rl)
3819
0
          return false;
3820
2.33k
        np++;
3821
2.33k
        break;
3822
2.33k
      }
3823
3.60k
      default:
3824
3.60k
        break;
3825
8.47k
    }
3826
8.38k
    ++i;
3827
8.38k
    start_piece = mystrsep(line, iter);
3828
8.38k
  }
3829
2.35k
  if (np != 2) {
3830
25
    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
3831
25
                     af->getlinenum());
3832
25
    return false;
3833
25
  }
3834
3835
  /* now parse the num lines to read in the remainder of the table */
3836
12.5k
  for (int j = 0; j < numrl; j++) {
3837
11.0k
    std::string nl;
3838
11.0k
    if (!af->getline(nl))
3839
518
      return false;
3840
10.4k
    mychomp(nl);
3841
10.4k
    i = 0;
3842
10.4k
    std::string pattern;
3843
10.4k
    std::string pattern2;
3844
10.4k
    iter = nl.begin();
3845
10.4k
    start_piece = mystrsep(nl, iter);
3846
50.2k
    while (start_piece != nl.end()) {
3847
39.8k
      {
3848
39.8k
        switch (i) {
3849
10.4k
          case 0: {
3850
10.4k
            if (nl.compare(start_piece - nl.begin(), keyword.size(), keyword, 0, keyword.size()) != 0) {
3851
148
              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3852
148
                               af->getlinenum());
3853
148
              delete *rl;
3854
148
              *rl = NULL;
3855
148
              return false;
3856
148
            }
3857
10.3k
            break;
3858
10.4k
          }
3859
10.3k
          case 1: {
3860
10.3k
            pattern.assign(start_piece, iter);
3861
10.3k
            break;
3862
10.4k
          }
3863
10.2k
          case 2: {
3864
10.2k
            pattern2.assign(start_piece, iter);
3865
10.2k
            break;
3866
10.4k
          }
3867
8.86k
          default:
3868
8.86k
            break;
3869
39.8k
        }
3870
39.7k
        ++i;
3871
39.7k
      }
3872
0
      start_piece = mystrsep(nl, iter);
3873
39.7k
    }
3874
10.3k
    if (pattern.empty() || pattern2.empty()) {
3875
105
      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3876
105
                       af->getlinenum());
3877
105
      return false;
3878
105
    }
3879
3880
10.2k
    (*rl)->add(pattern, pattern2);
3881
10.2k
  }
3882
1.55k
  return true;
3883
2.33k
}
3884
3885
/* parse in the phonetic (PHONE) rule table */
3886
1.33k
bool AffixMgr::parse_phonetable(const std::string& line, FileMgr* af) {
3887
1.33k
  if (phone) {
3888
50
    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3889
50
                     af->getlinenum());
3890
50
    return false;
3891
50
  }
3892
1.28k
  std::unique_ptr<phonetable> new_phone;
3893
1.28k
  int num = -1;
3894
1.28k
  int i = 0;
3895
1.28k
  int np = 0;
3896
1.28k
  auto iter = line.begin(), start_piece = mystrsep(line, iter);
3897
4.74k
  while (start_piece != line.end()) {
3898
3.52k
    switch (i) {
3899
1.28k
      case 0: {
3900
1.28k
        np++;
3901
1.28k
        break;
3902
0
      }
3903
1.26k
      case 1: {
3904
1.26k
        num = atoi(std::string(start_piece, iter).c_str());
3905
1.26k
        if (num < 1) {
3906
67
          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
3907
67
                           af->getlinenum());
3908
67
          return false;
3909
67
        }
3910
1.19k
        new_phone.reset(new phonetable);
3911
1.19k
        new_phone->utf8 = (char)utf8;
3912
1.19k
        np++;
3913
1.19k
        break;
3914
1.26k
      }
3915
979
      default:
3916
979
        break;
3917
3.52k
    }
3918
3.46k
    ++i;
3919
3.46k
    start_piece = mystrsep(line, iter);
3920
3.46k
  }
3921
1.21k
  if (np != 2) {
3922
18
    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
3923
18
                     af->getlinenum());
3924
18
    return false;
3925
18
  }
3926
3927
  /* now parse the phone->num lines to read in the remainder of the table */
3928
3.44k
  for (int j = 0; j < num; ++j) {
3929
2.43k
    std::string nl;
3930
2.43k
    if (!af->getline(nl))
3931
67
      return false;
3932
2.36k
    mychomp(nl);
3933
2.36k
    i = 0;
3934
2.36k
    const size_t old_size = new_phone->rules.size();
3935
2.36k
    iter = nl.begin();
3936
2.36k
    start_piece = mystrsep(nl, iter);
3937
11.6k
    while (start_piece != nl.end()) {
3938
9.38k
      {
3939
9.38k
        switch (i) {
3940
2.36k
          case 0: {
3941
2.36k
            if (nl.compare(start_piece - nl.begin(), 5, "PHONE", 5) != 0) {
3942
69
              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3943
69
                               af->getlinenum());
3944
69
              return false;
3945
69
            }
3946
2.29k
            break;
3947
2.36k
          }
3948
2.29k
          case 1: {
3949
2.28k
            new_phone->rules.emplace_back(start_piece, iter);
3950
2.28k
            break;
3951
2.36k
          }
3952
2.24k
          case 2: {
3953
2.24k
            new_phone->rules.emplace_back(start_piece, iter);
3954
2.24k
            mystrrep(new_phone->rules.back(), "_", "");
3955
2.24k
            break;
3956
2.36k
          }
3957
2.48k
          default:
3958
2.48k
            break;
3959
9.38k
        }
3960
9.31k
        ++i;
3961
9.31k
      }
3962
0
      start_piece = mystrsep(nl, iter);
3963
9.31k
    }
3964
2.29k
    if (new_phone->rules.size() != old_size + 2) {
3965
49
      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3966
49
                       af->getlinenum());
3967
49
      return false;
3968
49
    }
3969
2.29k
  }
3970
1.01k
  new_phone->rules.emplace_back("");
3971
1.01k
  new_phone->rules.emplace_back("");
3972
1.01k
  init_phonet_hash(*new_phone);
3973
1.01k
  phone = new_phone.release();
3974
1.01k
  return true;
3975
1.19k
}
3976
3977
/* parse in the checkcompoundpattern table */
3978
869
bool AffixMgr::parse_checkcpdtable(const std::string& line, FileMgr* af) {
3979
869
  if (parsedcheckcpd) {
3980
4
    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3981
4
                     af->getlinenum());
3982
4
    return false;
3983
4
  }
3984
865
  parsedcheckcpd = true;
3985
865
  int numcheckcpd = -1;
3986
865
  int i = 0;
3987
865
  int np = 0;
3988
865
  auto iter = line.begin(), start_piece = mystrsep(line, iter);
3989
3.72k
  while (start_piece != line.end()) {
3990
2.90k
    switch (i) {
3991
865
      case 0: {
3992
865
        np++;
3993
865
        break;
3994
0
      }
3995
863
      case 1: {
3996
863
        numcheckcpd = atoi(std::string(start_piece, iter).c_str());
3997
863
        if (numcheckcpd < 1) {
3998
45
          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
3999
45
                           af->getlinenum());
4000
45
          return false;
4001
45
        }
4002
818
        checkcpdtable.reserve(std::min(numcheckcpd, 16384));
4003
818
        np++;
4004
818
        break;
4005
863
      }
4006
1.18k
      default:
4007
1.18k
        break;
4008
2.90k
    }
4009
2.86k
    ++i;
4010
2.86k
    start_piece = mystrsep(line, iter);
4011
2.86k
  }
4012
820
  if (np != 2) {
4013
2
    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4014
2
                     af->getlinenum());
4015
2
    return false;
4016
2
  }
4017
4018
  /* now parse the numcheckcpd lines to read in the remainder of the table */
4019
9.30k
  for (int j = 0; j < numcheckcpd; ++j) {
4020
9.23k
    std::string nl;
4021
9.23k
    if (!af->getline(nl))
4022
582
      return false;
4023
8.65k
    mychomp(nl);
4024
8.65k
    i = 0;
4025
8.65k
    checkcpdtable.emplace_back();
4026
8.65k
    iter = nl.begin();
4027
8.65k
    start_piece = mystrsep(nl, iter);
4028
35.2k
    while (start_piece != nl.end()) {
4029
26.7k
      switch (i) {
4030
8.06k
        case 0: {
4031
8.06k
          if (nl.compare(start_piece - nl.begin(), 20, "CHECKCOMPOUNDPATTERN", 20) != 0) {
4032
169
            HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4033
169
                             af->getlinenum());
4034
169
            checkcpdtable.clear();
4035
169
            return false;
4036
169
          }
4037
7.89k
          break;
4038
8.06k
        }
4039
7.89k
        case 1: {
4040
6.84k
          checkcpdtable.back().pattern.assign(start_piece, iter);
4041
6.84k
          size_t slash_pos = checkcpdtable.back().pattern.find('/');
4042
6.84k
          if (slash_pos != std::string::npos) {
4043
5.23k
            std::string chunk(checkcpdtable.back().pattern, slash_pos + 1);
4044
5.23k
            checkcpdtable.back().pattern.resize(slash_pos);
4045
5.23k
            checkcpdtable.back().cond = pHMgr->decode_flag(chunk);
4046
5.23k
          }
4047
6.84k
          break;
4048
8.06k
        }
4049
5.38k
        case 2: {
4050
5.38k
          checkcpdtable.back().pattern2.assign(start_piece, iter);
4051
5.38k
          size_t slash_pos = checkcpdtable.back().pattern2.find('/');
4052
5.38k
          if (slash_pos != std::string::npos) {
4053
4.35k
            std::string chunk(checkcpdtable.back().pattern2, slash_pos + 1);
4054
4.35k
            checkcpdtable.back().pattern2.resize(slash_pos);
4055
4.35k
            checkcpdtable.back().cond2 = pHMgr->decode_flag(chunk);
4056
4.35k
          }
4057
5.38k
          break;
4058
8.06k
        }
4059
3.37k
        case 3: {
4060
3.37k
          checkcpdtable.back().pattern3.assign(start_piece, iter);
4061
3.37k
          simplifiedcpd = 1;
4062
3.37k
          break;
4063
8.06k
        }
4064
3.10k
        default:
4065
3.10k
          break;
4066
26.7k
      }
4067
26.6k
      i++;
4068
26.6k
      start_piece = mystrsep(nl, iter);
4069
26.6k
    }
4070
8.65k
  }
4071
67
  return true;
4072
818
}
4073
4074
/* parse in the compound rule table */
4075
933
bool AffixMgr::parse_defcpdtable(const std::string& line, FileMgr* af) {
4076
933
  if (parseddefcpd) {
4077
4
    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4078
4
                     af->getlinenum());
4079
4
    return false;
4080
4
  }
4081
929
  parseddefcpd = true;
4082
929
  int numdefcpd = -1;
4083
929
  int i = 0;
4084
929
  int np = 0;
4085
929
  auto iter = line.begin(), start_piece = mystrsep(line, iter);
4086
3.38k
  while (start_piece != line.end()) {
4087
2.50k
    switch (i) {
4088
929
      case 0: {
4089
929
        np++;
4090
929
        break;
4091
0
      }
4092
916
      case 1: {
4093
916
        numdefcpd = atoi(std::string(start_piece, iter).c_str());
4094
916
        if (numdefcpd < 1) {
4095
46
          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4096
46
                           af->getlinenum());
4097
46
          return false;
4098
46
        }
4099
870
        defcpdtable.reserve(std::min(numdefcpd, 16384));
4100
870
        np++;
4101
870
        break;
4102
916
      }
4103
656
      default:
4104
656
        break;
4105
2.50k
    }
4106
2.45k
    ++i;
4107
2.45k
    start_piece = mystrsep(line, iter);
4108
2.45k
  }
4109
883
  if (np != 2) {
4110
13
    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4111
13
                     af->getlinenum());
4112
13
    return false;
4113
13
  }
4114
4115
  /* now parse the numdefcpd lines to read in the remainder of the table */
4116
5.54k
  for (int j = 0; j < numdefcpd; ++j) {
4117
5.47k
    std::string nl;
4118
5.47k
    if (!af->getline(nl))
4119
390
      return false;
4120
5.08k
    mychomp(nl);
4121
5.08k
    i = 0;
4122
5.08k
    defcpdtable.emplace_back();
4123
5.08k
    iter = nl.begin();
4124
5.08k
    start_piece = mystrsep(nl, iter);
4125
17.2k
    while (start_piece != nl.end()) {
4126
12.5k
      switch (i) {
4127
5.08k
        case 0: {
4128
5.08k
          if (nl.compare(start_piece - nl.begin(), 12, "COMPOUNDRULE", 12) != 0) {
4129
378
            HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4130
378
                             af->getlinenum());
4131
378
            numdefcpd = 0;
4132
378
            return false;
4133
378
          }
4134
4.70k
          break;
4135
5.08k
        }
4136
4.70k
        case 1: {  // handle parenthesized flags
4137
4.68k
          if (std::find(start_piece, iter, '(') != iter) {
4138
302k
            for (auto k = start_piece; k != iter; ++k) {
4139
302k
              auto chb = k, che = k + 1;
4140
302k
              if (*k == '(') {
4141
2.89k
              auto parpos = std::find(k, iter, ')');
4142
2.89k
                if (parpos != iter) {
4143
1.15k
                  chb = k + 1;
4144
1.15k
                  che = parpos;
4145
1.15k
                  k = parpos;
4146
1.15k
                }
4147
2.89k
              }
4148
4149
302k
              if (*chb == '*' || *chb == '?') {
4150
6.00k
                defcpdtable.back().push_back((FLAG)*chb);
4151
296k
              } else {
4152
296k
                pHMgr->decode_flags(defcpdtable.back(), std::string(chb, che), af);
4153
296k
              }
4154
302k
            }
4155
3.99k
          } else {
4156
3.99k
            pHMgr->decode_flags(defcpdtable.back(), std::string(start_piece, iter), af);
4157
3.99k
          }
4158
4.68k
          break;
4159
5.08k
        }
4160
2.80k
        default:
4161
2.80k
          break;
4162
12.5k
      }
4163
12.2k
      ++i;
4164
12.2k
      start_piece = mystrsep(nl, iter);
4165
12.2k
    }
4166
4.71k
    if (defcpdtable.back().empty()) {
4167
32
      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4168
32
                       af->getlinenum());
4169
32
      return false;
4170
32
    }
4171
4.71k
  }
4172
70
  return true;
4173
870
}
4174
4175
/* parse in the character map table */
4176
731
bool AffixMgr::parse_maptable(const std::string& line, FileMgr* af) {
4177
731
  if (parsedmaptable) {
4178
18
    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4179
18
                     af->getlinenum());
4180
18
    return false;
4181
18
  }
4182
713
  parsedmaptable = true;
4183
713
  int nummap = -1;
4184
713
  int i = 0;
4185
713
  int np = 0;
4186
713
  auto iter = line.begin(), start_piece = mystrsep(line, iter);
4187
3.04k
  while (start_piece != line.end()) {
4188
2.42k
    switch (i) {
4189
713
      case 0: {
4190
713
        np++;
4191
713
        break;
4192
0
      }
4193
705
      case 1: {
4194
705
        nummap = atoi(std::string(start_piece, iter).c_str());
4195
705
        if (nummap < 1) {
4196
90
          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4197
90
                           af->getlinenum());
4198
90
          return false;
4199
90
        }
4200
615
        maptable.reserve(std::min(nummap, 16384));
4201
615
        np++;
4202
615
        break;
4203
705
      }
4204
1.00k
      default:
4205
1.00k
        break;
4206
2.42k
    }
4207
2.33k
    ++i;
4208
2.33k
    start_piece = mystrsep(line, iter);
4209
2.33k
  }
4210
623
  if (np != 2) {
4211
8
    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4212
8
                     af->getlinenum());
4213
8
    return false;
4214
8
  }
4215
4216
  /* now parse the nummap lines to read in the remainder of the table */
4217
2.92k
  for (int j = 0; j < nummap; ++j) {
4218
2.61k
    std::string nl;
4219
2.61k
    if (!af->getline(nl))
4220
192
      return false;
4221
2.41k
    mychomp(nl);
4222
2.41k
    i = 0;
4223
2.41k
    maptable.emplace_back();
4224
2.41k
    iter = nl.begin();
4225
2.41k
    start_piece = mystrsep(nl, iter);
4226
8.20k
    while (start_piece != nl.end()) {
4227
5.88k
      switch (i) {
4228
2.41k
        case 0: {
4229
2.41k
          if (nl.compare(start_piece - nl.begin(), 3, "MAP", 3) != 0) {
4230
95
            HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4231
95
                             af->getlinenum());
4232
95
            nummap = 0;
4233
95
            return false;
4234
95
          }
4235
2.31k
          break;
4236
2.41k
        }
4237
2.31k
        case 1: {
4238
390k
          for (auto k = start_piece; k != iter; ++k) {
4239
388k
            auto chb = k, che = k + 1;
4240
388k
            if (*k == '(') {
4241
2.57k
              auto parpos = std::find(k, iter, ')');
4242
2.57k
              if (parpos != iter) {
4243
877
                chb = k + 1;
4244
877
                che = parpos;
4245
877
                k = parpos;
4246
877
              }
4247
385k
            } else {
4248
385k
              if (utf8 && (*k & 0xc0) == 0xc0) {
4249
2.99k
                ++k;
4250
3.47k
                while (k != iter && (*k & 0xc0) == 0x80)
4251
477
                    ++k;
4252
2.99k
                che = k;
4253
2.99k
                --k;
4254
2.99k
              }
4255
385k
            }
4256
388k
            if (chb == che) {
4257
511
              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4258
511
                              af->getlinenum());
4259
511
            }
4260
4261
388k
            maptable.back().emplace_back(chb, che);
4262
388k
          }
4263
2.30k
          break;
4264
2.41k
        }
4265
1.16k
        default:
4266
1.16k
          break;
4267
5.88k
      }
4268
5.79k
      ++i;
4269
5.79k
      start_piece = mystrsep(nl, iter);
4270
5.79k
    }
4271
2.32k
    if (maptable.back().empty()) {
4272
15
      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4273
15
                       af->getlinenum());
4274
15
      return false;
4275
15
    }
4276
2.32k
  }
4277
313
  return true;
4278
615
}
4279
4280
/* parse in the word breakpoint table */
4281
1.94k
bool AffixMgr::parse_breaktable(const std::string& line, FileMgr* af) {
4282
1.94k
  if (parsedbreaktable) {
4283
46
    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4284
46
                     af->getlinenum());
4285
46
    return false;
4286
46
  }
4287
1.89k
  parsedbreaktable = true;
4288
1.89k
  int numbreak = -1;
4289
1.89k
  int i = 0;
4290
1.89k
  int np = 0;
4291
1.89k
  auto iter = line.begin(), start_piece = mystrsep(line, iter);
4292
4.84k
  while (start_piece != line.end()) {
4293
4.46k
    switch (i) {
4294
1.89k
      case 0: {
4295
1.89k
        np++;
4296
1.89k
        break;
4297
0
      }
4298
1.81k
      case 1: {
4299
1.81k
        numbreak = atoi(std::string(start_piece, iter).c_str());
4300
1.81k
        if (numbreak < 0) {
4301
34
          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4302
34
                           af->getlinenum());
4303
34
          return false;
4304
34
        }
4305
1.77k
        if (numbreak == 0)
4306
1.48k
          return true;
4307
297
        breaktable.reserve(std::min(numbreak, 16384));
4308
297
        np++;
4309
297
        break;
4310
1.77k
      }
4311
755
      default:
4312
755
        break;
4313
4.46k
    }
4314
2.94k
    ++i;
4315
2.94k
    start_piece = mystrsep(line, iter);
4316
2.94k
  }
4317
381
  if (np != 2) {
4318
84
    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4319
84
                     af->getlinenum());
4320
84
    return false;
4321
84
  }
4322
4323
  /* now parse the numbreak lines to read in the remainder of the table */
4324
4.34k
  for (int j = 0; j < numbreak; ++j) {
4325
4.29k
    std::string nl;
4326
4.29k
    if (!af->getline(nl))
4327
155
      return false;
4328
4.13k
    mychomp(nl);
4329
4.13k
    i = 0;
4330
4.13k
    iter = nl.begin();
4331
4.13k
    start_piece = mystrsep(nl, iter);
4332
10.8k
    while (start_piece != nl.end()) {
4333
6.84k
      switch (i) {
4334
2.62k
        case 0: {
4335
2.62k
          if (nl.compare(start_piece - nl.begin(), 5, "BREAK", 5) != 0) {
4336
89
            HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4337
89
                             af->getlinenum());
4338
89
            numbreak = 0;
4339
89
            return false;
4340
89
          }
4341
2.53k
          break;
4342
2.62k
        }
4343
2.53k
        case 1: {
4344
2.49k
          breaktable.emplace_back(start_piece, iter);
4345
2.49k
          break;
4346
2.62k
        }
4347
1.72k
        default:
4348
1.72k
          break;
4349
6.84k
      }
4350
6.75k
      ++i;
4351
6.75k
      start_piece = mystrsep(nl, iter);
4352
6.75k
    }
4353
4.13k
  }
4354
4355
53
  if (breaktable.size() != static_cast<size_t>(numbreak)) {
4356
3
    HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4357
3
                     af->getlinenum());
4358
3
    return false;
4359
3
  }
4360
4361
50
  return true;
4362
53
}
4363
4364
28.6k
/**
 * Fix up the bracket expressions of a condition string whose characters have
 * already been reversed, rewriting it in place.
 *
 * The string is walked from its last character to its first. A ']' becomes
 * '[' and a '[' becomes ']'. When a '^' directly precedes (in walking order
 * follows) a ']' the group is negated: its characters are each shifted one
 * position towards the end of the string so that the '^' ends up right
 * after the new opening '['.
 *
 * Example: "]cba^[" is rewritten to "[^abc]"; "]cba[" to "[abc]".
 *
 * @param piece condition string, modified in place; empty input is a no-op
 */
void AffixMgr::reverse_condition(std::string& piece) {
  if (piece.empty())
    return;

  // `neg` is only ever set while handling a character that is not the first
  // one visited, so every `*(k - 1)` write below refers to a valid element.
  bool neg = false;
  const auto first = piece.rbegin();
  for (auto k = first; k != piece.rend(); ++k) {
    switch (*k) {
      case '[': {
        if (neg)
          *(k - 1) = '[';
        else
          *k = ']';
        break;
      }
      case ']': {
        *k = '[';
        if (neg)
          *(k - 1) = '^';
        neg = false;
        break;
      }
      case '^': {
        // Guard against stepping before rbegin(): on the first visited
        // character there is no previously visited one to inspect.
        if (k != first && *(k - 1) == ']')
          neg = true;
        else if (neg)
          *(k - 1) = *k;
        break;
      }
      default: {
        if (neg)
          *(k - 1) = *k;
        break;
      }
    }
  }
}
4399
4400
class entries_container {
4401
  std::vector<AffEntry*> entries;
4402
  AffixMgr* m_mgr;
4403
  char m_at;
4404
public:
4405
  entries_container(char at, AffixMgr* mgr)
4406
35.1k
    : m_mgr(mgr)
4407
35.1k
    , m_at(at) {
4408
35.1k
  }
4409
33.3k
  void release() {
4410
33.3k
    entries.clear();
4411
33.3k
  }
4412
  void initialize(int numents,
4413
34.4k
                  char opts, unsigned short aflag) {
4414
34.4k
    entries.reserve(std::min(numents, 16384));
4415
4416
34.4k
    if (m_at == 'P') {
4417
14.0k
      entries.push_back(new PfxEntry(m_mgr));
4418
20.3k
    } else {
4419
20.3k
      entries.push_back(new SfxEntry(m_mgr));
4420
20.3k
    }
4421
4422
34.4k
    entries.back()->opts = opts;
4423
34.4k
    entries.back()->aflag = aflag;
4424
34.4k
  }
4425
4426
19.2k
  AffEntry* add_entry(char opts) {
4427
19.2k
    if (m_at == 'P') {
4428
9.14k
      entries.push_back(new PfxEntry(m_mgr));
4429
10.1k
    } else {
4430
10.1k
      entries.push_back(new SfxEntry(m_mgr));
4431
10.1k
    }
4432
19.2k
    AffEntry* ret = entries.back();
4433
19.2k
    ret->opts = entries[0]->opts & opts;
4434
19.2k
    return ret;
4435
19.2k
  }
4436
4437
53.5k
  AffEntry* first_entry() {
4438
53.5k
    return entries.empty() ? NULL : entries[0];
4439
53.5k
  }
4440
4441
35.1k
  ~entries_container() {
4442
35.1k
    for (auto& entry : entries) {
4443
7.96k
      delete entry;
4444
7.96k
    }
4445
35.1k
  }
4446
4447
33.3k
  std::vector<AffEntry*>::iterator begin() { return entries.begin(); }
4448
33.3k
  std::vector<AffEntry*>::iterator end() { return entries.end(); }
4449
};
4450
4451
bool AffixMgr::parse_affix(const std::string& line,
4452
                          const char at,
4453
                          FileMgr* af,
4454
35.1k
                          char* dupflags) {
4455
35.1k
  int numents = 0;  // number of AffEntry structures to parse
4456
4457
35.1k
  unsigned short aflag = 0;  // affix char identifier
4458
4459
35.1k
  char ff = 0;
4460
35.1k
  entries_container affentries(at, this);
4461
4462
35.1k
  int i = 0;
4463
4464
// checking lines with bad syntax
4465
#ifdef DEBUG
4466
  int basefieldnum = 0;
4467
#endif
4468
4469
  // split affix header line into pieces
4470
4471
35.1k
  int np = 0;
4472
35.1k
  auto iter = line.begin(), start_piece = mystrsep(line, iter);
4473
189k
  while (start_piece != line.end()) {
4474
154k
    switch (i) {
4475
      // piece 1 - is type of affix
4476
35.1k
      case 0: {
4477
35.1k
        np++;
4478
35.1k
        break;
4479
0
      }
4480
4481
      // piece 2 - is affix char
4482
34.9k
      case 1: {
4483
34.9k
        np++;
4484
34.9k
        aflag = pHMgr->decode_flag(std::string(start_piece, iter));
4485
34.9k
        if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
4486
34.9k
            ((at == 'P') && (dupflags[aflag] & dupPFX))) {
4487
13.9k
          HUNSPELL_WARNING(
4488
13.9k
              stderr,
4489
13.9k
              "error: line %d: multiple definitions of an affix flag\n",
4490
13.9k
              af->getlinenum());
4491
13.9k
        }
4492
34.9k
        dupflags[aflag] += (char)((at == 'S') ? dupSFX : dupPFX);
4493
34.9k
        break;
4494
0
      }
4495
      // piece 3 - is cross product indicator
4496
34.7k
      case 2: {
4497
34.7k
        np++;
4498
34.7k
        if (*start_piece == 'Y')
4499
4.96k
          ff = aeXPRODUCT;
4500
34.7k
        break;
4501
0
      }
4502
4503
      // piece 4 - is number of affentries
4504
34.5k
      case 3: {
4505
34.5k
        np++;
4506
34.5k
        numents = atoi(std::string(start_piece, iter).c_str());
4507
34.5k
        if ((numents <= 0) || ((std::numeric_limits<size_t>::max() /
4508
34.4k
                                sizeof(AffEntry)) < static_cast<size_t>(numents))) {
4509
178
          std::string err = pHMgr->encode_flag(aflag);
4510
178
          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4511
178
                           af->getlinenum());
4512
178
          return false;
4513
178
        }
4514
4515
34.4k
        char opts = ff;
4516
34.4k
        if (utf8)
4517
7.25k
          opts |= aeUTF8;
4518
34.4k
        if (pHMgr->is_aliasf())
4519
2.14k
          opts |= aeALIASF;
4520
34.4k
        if (pHMgr->is_aliasm())
4521
4.16k
          opts |= aeALIASM;
4522
34.4k
        affentries.initialize(numents, opts, aflag);
4523
34.4k
      }
4524
4525
49.2k
      default:
4526
49.2k
        break;
4527
154k
    }
4528
154k
    ++i;
4529
154k
    start_piece = mystrsep(line, iter);
4530
154k
  }
4531
  // check to make sure we parsed enough pieces
4532
34.9k
  if (np != 4) {
4533
506
    std::string err = pHMgr->encode_flag(aflag);
4534
506
    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4535
506
                     af->getlinenum());
4536
506
    return false;
4537
506
  }
4538
4539
  // now parse numents affentries for this affix
4540
34.4k
  AffEntry* entry = affentries.first_entry();
4541
87.2k
  for (int ent = 0; ent < numents; ++ent) {
4542
53.8k
    std::string nl;
4543
53.8k
    if (!af->getline(nl))
4544
316
      return false;
4545
53.5k
    mychomp(nl);
4546
4547
53.5k
    iter = nl.begin();
4548
53.5k
    i = 0;
4549
53.5k
    np = 0;
4550
4551
    // split line into pieces
4552
53.5k
    start_piece = mystrsep(nl, iter);
4553
338k
    while (start_piece != nl.end()) {
4554
285k
      switch (i) {
4555
        // piece 1 - is type
4556
53.5k
        case 0: {
4557
53.5k
          np++;
4558
53.5k
          if (ent != 0)
4559
19.2k
            entry = affentries.add_entry((char)(aeXPRODUCT | aeUTF8 | aeALIASF | aeALIASM));
4560
53.5k
          break;
4561
0
        }
4562
4563
        // piece 2 - is affix char
4564
53.3k
        case 1: {
4565
53.3k
          np++;
4566
53.3k
          std::string chunk(start_piece, iter);
4567
53.3k
          if (pHMgr->decode_flag(chunk) != aflag) {
4568
285
            std::string err = pHMgr->encode_flag(aflag);
4569
285
            HUNSPELL_WARNING(stderr,
4570
285
                             "error: line %d: affix %s is corrupt\n",
4571
285
                             af->getlinenum(), err.c_str());
4572
285
            return false;
4573
285
          }
4574
4575
53.0k
          if (ent != 0) {
4576
19.1k
            AffEntry* start_entry = affentries.first_entry();
4577
19.1k
            entry->aflag = start_entry->aflag;
4578
19.1k
          }
4579
53.0k
          break;
4580
53.3k
        }
4581
4582
        // piece 3 - is string to strip or 0 for null
4583
52.9k
        case 2: {
4584
52.9k
          np++;
4585
52.9k
          entry->strip = std::string(start_piece, iter);
4586
52.9k
          if (complexprefixes) {
4587
21.6k
            if (utf8)
4588
5.24k
              reverseword_utf(entry->strip);
4589
16.4k
            else
4590
16.4k
              reverseword(entry->strip);
4591
21.6k
          }
4592
52.9k
          if (entry->strip.compare("0") == 0) {
4593
2.58k
            entry->strip.clear();
4594
2.58k
          }
4595
52.9k
          break;
4596
53.3k
        }
4597
4598
        // piece 4 - is affix string or 0 for null
4599
52.8k
        case 3: {
4600
52.8k
          entry->morphcode = NULL;
4601
52.8k
          entry->contclass = NULL;
4602
52.8k
          entry->contclasslen = 0;
4603
52.8k
          np++;
4604
52.8k
          std::string::const_iterator dash = std::find(start_piece, iter, '/');
4605
52.8k
          if (dash != iter) {
4606
17.9k
            entry->appnd = std::string(start_piece, dash);
4607
17.9k
            std::string dash_str(dash + 1, iter);
4608
4609
17.9k
            if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) {
4610
1.35k
              if (utf8) {
4611
455
                remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
4612
900
              } else {
4613
900
                remove_ignored_chars(entry->appnd, ignorechars);
4614
900
              }
4615
1.35k
            }
4616
4617
17.9k
            if (complexprefixes) {
4618
6.80k
              if (utf8)
4619
1.21k
                reverseword_utf(entry->appnd);
4620
5.58k
              else
4621
5.58k
                reverseword(entry->appnd);
4622
6.80k
            }
4623
4624
17.9k
            if (pHMgr->is_aliasf()) {
4625
1.76k
              int index = atoi(dash_str.c_str());
4626
1.76k
              entry->contclasslen = (unsigned short)pHMgr->get_aliasf(
4627
1.76k
                  index, &(entry->contclass), af);
4628
1.76k
              if (!entry->contclasslen)
4629
861
                HUNSPELL_WARNING(stderr,
4630
861
                                 "error: bad affix flag alias: \"%s\"\n",
4631
861
                                 dash_str.c_str());
4632
16.1k
            } else {
4633
16.1k
              entry->contclasslen = (unsigned short)pHMgr->decode_flags(
4634
16.1k
                  &(entry->contclass), dash_str, af);
4635
16.1k
              std::sort(entry->contclass, entry->contclass + entry->contclasslen);
4636
16.1k
            }
4637
4638
17.9k
            havecontclass = 1;
4639
736k
            for (unsigned short _i = 0; _i < entry->contclasslen; _i++) {
4640
718k
              contclasses[(entry->contclass)[_i]] = 1;
4641
718k
            }
4642
34.8k
          } else {
4643
34.8k
            entry->appnd = std::string(start_piece, iter);
4644
4645
34.8k
            if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) {
4646
3.58k
              if (utf8) {
4647
1.03k
                remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
4648
2.55k
              } else {
4649
2.55k
                remove_ignored_chars(entry->appnd, ignorechars);
4650
2.55k
              }
4651
3.58k
            }
4652
4653
34.8k
            if (complexprefixes) {
4654
14.8k
              if (utf8)
4655
4.01k
                reverseword_utf(entry->appnd);
4656
10.8k
              else
4657
10.8k
                reverseword(entry->appnd);
4658
14.8k
            }
4659
34.8k
          }
4660
4661
52.8k
          if (entry->appnd.compare("0") == 0) {
4662
1.72k
            entry->appnd.clear();
4663
1.72k
          }
4664
52.8k
          break;
4665
53.3k
        }
4666
4667
        // piece 5 - is the conditions descriptions
4668
27.8k
        case 4: {
4669
27.8k
          std::string chunk(start_piece, iter);
4670
27.8k
          np++;
4671
27.8k
          if (complexprefixes) {
4672
11.8k
            if (utf8)
4673
2.32k
              reverseword_utf(chunk);
4674
9.54k
            else
4675
9.54k
              reverseword(chunk);
4676
11.8k
            reverse_condition(chunk);
4677
11.8k
          }
4678
27.8k
          if (!entry->strip.empty() && chunk != "." &&
4679
27.8k
              redundant_condition(at, entry->strip, chunk,
4680
27.3k
                                  af->getlinenum()))
4681
3.01k
            chunk = ".";
4682
27.8k
          if (at == 'S') {
4683
16.7k
            reverseword(chunk);
4684
16.7k
            reverse_condition(chunk);
4685
16.7k
          }
4686
27.8k
          if (encodeit(*entry, chunk))
4687
0
            return false;
4688
27.8k
          break;
4689
27.8k
        }
4690
4691
27.8k
        case 5: {
4692
19.7k
          std::string chunk(start_piece, iter);
4693
19.7k
          np++;
4694
19.7k
          if (pHMgr->is_aliasm()) {
4695
3.05k
            int index = atoi(chunk.c_str());
4696
3.05k
            entry->morphcode = pHMgr->get_aliasm(index);
4697
16.6k
          } else {
4698
16.6k
            if (complexprefixes) {  // XXX - fix me for morph. gen.
4699
6.39k
              if (utf8)
4700
1.63k
                reverseword_utf(chunk);
4701
4.75k
              else
4702
4.75k
                reverseword(chunk);
4703
6.39k
            }
4704
            // add the remaining of the line
4705
16.6k
            std::string::const_iterator end = nl.end();
4706
16.6k
            if (iter != end) {
4707
8.22k
              chunk.append(iter, end);
4708
8.22k
            }
4709
16.6k
            entry->morphcode = mystrdup(chunk.c_str());
4710
16.6k
          }
4711
19.7k
          break;
4712
27.8k
        }
4713
25.0k
        default:
4714
25.0k
          break;
4715
285k
      }
4716
284k
      i++;
4717
284k
      start_piece = mystrsep(nl, iter);
4718
284k
    }
4719
    // check to make sure we parsed enough pieces
4720
53.2k
    if (np < 4) {
4721
459
      std::string err = pHMgr->encode_flag(aflag);
4722
459
      HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
4723
459
                       af->getlinenum(), err.c_str());
4724
459
      return false;
4725
459
    }
4726
4727
#ifdef DEBUG
4728
    // detect unnecessary fields, excepting comments
4729
    if (basefieldnum) {
4730
      int fieldnum =
4731
          !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6);
4732
      if (fieldnum != basefieldnum)
4733
        HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n",
4734
                         af->getlinenum());
4735
    } else {
4736
      basefieldnum =
4737
          !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6);
4738
    }
4739
#endif
4740
53.2k
  }
4741
4742
  // now create SfxEntry or PfxEntry objects and use links to
4743
  // build an ordered (sorted by affix string) list
4744
33.3k
  auto start = affentries.begin(), end = affentries.end();
4745
79.0k
  for (auto affentry = start; affentry != end; ++affentry) {
4746
45.7k
    if (at == 'P') {
4747
18.5k
      build_pfxtree(dynamic_cast<PfxEntry*>(*affentry));
4748
27.1k
    } else {
4749
27.1k
      build_sfxtree(dynamic_cast<SfxEntry*>(*affentry));
4750
27.1k
    }
4751
45.7k
  }
4752
4753
  //contents belong to AffixMgr now
4754
33.3k
  affentries.release();
4755
4756
33.3k
  return true;
4757
34.4k
}
4758
4759
int AffixMgr::redundant_condition(char ft,
4760
                                  const std::string& strip,
4761
                                  const std::string& cond,
4762
27.3k
                                  int linenum) {
4763
27.3k
  int stripl = strip.size(), condl = cond.size(), i, j, neg, in;
4764
27.3k
  if (ft == 'P') {  // prefix
4765
11.0k
    if (strip.compare(0, condl, cond) == 0)
4766
1.30k
      return 1;
4767
9.72k
    if (utf8) {
4768
8.08k
    } else {
4769
13.3k
      for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
4770
12.2k
        if (cond[j] != '[') {
4771
8.55k
          if (cond[j] != strip[i]) {
4772
4.25k
            HUNSPELL_WARNING(stderr,
4773
4.25k
                             "warning: line %d: incompatible stripping "
4774
4.25k
                             "characters and condition\n",
4775
4.25k
                             linenum);
4776
4.25k
            return 0;
4777
4.25k
          }
4778
8.55k
        } else {
4779
3.70k
          neg = (cond[j + 1] == '^') ? 1 : 0;
4780
3.70k
          in = 0;
4781
333k
          do {
4782
333k
            j++;
4783
333k
            if (strip[i] == cond[j])
4784
2.03k
              in = 1;
4785
333k
          } while ((j < (condl - 1)) && (cond[j] != ']'));
4786
3.70k
          if (j == (condl - 1) && (cond[j] != ']')) {
4787
668
            HUNSPELL_WARNING(stderr,
4788
668
                             "error: line %d: missing ] in condition:\n%s\n",
4789
668
                             linenum, cond.c_str());
4790
668
            return 0;
4791
668
          }
4792
3.03k
          if ((!neg && !in) || (neg && in)) {
4793
2.08k
            HUNSPELL_WARNING(stderr,
4794
2.08k
                             "warning: line %d: incompatible stripping "
4795
2.08k
                             "characters and condition\n",
4796
2.08k
                             linenum);
4797
2.08k
            return 0;
4798
2.08k
          }
4799
3.03k
        }
4800
12.2k
      }
4801
1.08k
      if (j >= condl)
4802
275
        return 1;
4803
1.08k
    }
4804
16.2k
  } else {  // suffix
4805
16.2k
    if ((stripl >= condl) && strip.compare(stripl - condl, std::string::npos, cond) == 0)
4806
778
      return 1;
4807
15.5k
    if (utf8) {
4808
12.6k
    } else {
4809
17.7k
      for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
4810
16.7k
        if (cond[j] != ']') {
4811
9.31k
          if (cond[j] != strip[i]) {
4812
6.47k
            HUNSPELL_WARNING(stderr,
4813
6.47k
                             "warning: line %d: incompatible stripping "
4814
6.47k
                             "characters and condition\n",
4815
6.47k
                             linenum);
4816
6.47k
            return 0;
4817
6.47k
          }
4818
9.31k
        } else if (j > 0) {
4819
7.22k
          in = 0;
4820
222k
          do {
4821
222k
            j--;
4822
222k
            if (strip[i] == cond[j])
4823
8.09k
              in = 1;
4824
222k
          } while ((j > 0) && (cond[j] != '['));
4825
7.22k
          if ((j == 0) && (cond[j] != '[')) {
4826
1.93k
            HUNSPELL_WARNING(stderr,
4827
1.93k
                             "error: line: %d: missing ] in condition:\n%s\n",
4828
1.93k
                             linenum, cond.c_str());
4829
1.93k
            return 0;
4830
1.93k
          }
4831
5.28k
          neg = (cond[j + 1] == '^') ? 1 : 0;
4832
5.28k
          if ((!neg && !in) || (neg && in)) {
4833
3.19k
            HUNSPELL_WARNING(stderr,
4834
3.19k
                             "warning: line %d: incompatible stripping "
4835
3.19k
                             "characters and condition\n",
4836
3.19k
                             linenum);
4837
3.19k
            return 0;
4838
3.19k
          }
4839
5.28k
        }
4840
16.7k
      }
4841
1.01k
      if (j < 0)
4842
660
        return 1;
4843
1.01k
    }
4844
15.5k
  }
4845
5.69k
  return 0;
4846
27.3k
}
4847
4848
std::vector<std::string> AffixMgr::get_suffix_words(short unsigned* suff,
4849
                               int len,
4850
0
                               const std::string& root_word) {
4851
0
  std::vector<std::string> slst;
4852
0
  short unsigned* start_ptr = suff;
4853
0
  for (auto ptr : sStart) {
4854
0
    while (ptr) {
4855
0
      suff = start_ptr;
4856
0
      for (int i = 0; i < len; i++) {
4857
0
        if ((*suff) == ptr->getFlag()) {
4858
0
          std::string nw(root_word);
4859
0
          nw.append(ptr->getAffix());
4860
0
          hentry* ht = ptr->checkword(nw, 0, nw.size(), 0, NULL, 0, 0, 0);
4861
0
          if (ht) {
4862
0
            slst.push_back(nw);
4863
0
          }
4864
0
        }
4865
0
        suff++;
4866
0
      }
4867
0
      ptr = ptr->getNext();
4868
0
    }
4869
0
  }
4870
0
  return slst;
4871
0
}