/src/hunspell/src/hunspell/affixmgr.cxx
Line | Count | Source (jump to first uncovered line) |
1 | | /* ***** BEGIN LICENSE BLOCK ***** |
2 | | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
3 | | * |
4 | | * Copyright (C) 2002-2022 Németh László |
5 | | * |
6 | | * The contents of this file are subject to the Mozilla Public License Version |
7 | | * 1.1 (the "License"); you may not use this file except in compliance with |
8 | | * the License. You may obtain a copy of the License at |
9 | | * http://www.mozilla.org/MPL/ |
10 | | * |
11 | | * Software distributed under the License is distributed on an "AS IS" basis, |
12 | | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
13 | | * for the specific language governing rights and limitations under the |
14 | | * License. |
15 | | * |
16 | | * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. |
17 | | * |
18 | | * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, |
19 | | * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, |
20 | | * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, |
21 | | * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, |
22 | | * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen |
23 | | * |
24 | | * Alternatively, the contents of this file may be used under the terms of |
25 | | * either the GNU General Public License Version 2 or later (the "GPL"), or |
26 | | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
27 | | * in which case the provisions of the GPL or the LGPL are applicable instead |
28 | | * of those above. If you wish to allow use of your version of this file only |
29 | | * under the terms of either the GPL or the LGPL, and not to allow others to |
30 | | * use your version of this file under the terms of the MPL, indicate your |
31 | | * decision by deleting the provisions above and replace them with the notice |
32 | | * and other provisions required by the GPL or the LGPL. If you do not delete |
33 | | * the provisions above, a recipient may use your version of this file under |
34 | | * the terms of any one of the MPL, the GPL or the LGPL. |
35 | | * |
36 | | * ***** END LICENSE BLOCK ***** */ |
37 | | /* |
38 | | * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada |
39 | | * And Contributors. All rights reserved. |
40 | | * |
41 | | * Redistribution and use in source and binary forms, with or without |
42 | | * modification, are permitted provided that the following conditions |
43 | | * are met: |
44 | | * |
45 | | * 1. Redistributions of source code must retain the above copyright |
46 | | * notice, this list of conditions and the following disclaimer. |
47 | | * |
48 | | * 2. Redistributions in binary form must reproduce the above copyright |
49 | | * notice, this list of conditions and the following disclaimer in the |
50 | | * documentation and/or other materials provided with the distribution. |
51 | | * |
52 | | * 3. All modifications to the source code must be clearly marked as |
53 | | * such. Binary redistributions based on modified source code |
54 | | * must be clearly marked as modified versions in the documentation |
55 | | * and/or other materials provided with the distribution. |
56 | | * |
57 | | * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS |
58 | | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
59 | | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
60 | | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL |
61 | | * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
62 | | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
63 | | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
64 | | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
65 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
66 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
67 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
68 | | * SUCH DAMAGE. |
69 | | */ |
70 | | |
71 | | #include <cstdlib> |
72 | | #include <cstring> |
73 | | #include <cstdio> |
74 | | #include <cctype> |
75 | | #include <ctime> |
76 | | |
77 | | #include <algorithm> |
78 | | #include <chrono> |
79 | | #include <memory> |
80 | | #include <limits> |
81 | | #include <string> |
82 | | #include <vector> |
83 | | |
84 | | #include "affixmgr.hxx" |
85 | | #include "affentry.hxx" |
86 | | #include "langnum.hxx" |
87 | | |
88 | | #include "csutil.hxx" |
89 | | |
90 | | AffixMgr::AffixMgr(const char* affpath, |
91 | | const std::vector<HashMgr*>& ptr, |
92 | | const char* key) |
93 | 18.3k | : alldic(ptr) |
94 | 18.3k | , pHMgr(ptr[0]) { |
95 | | |
96 | | // register hash manager and load affix data from aff file |
97 | 18.3k | csconv = NULL; |
98 | 18.3k | utf8 = 0; |
99 | 18.3k | complexprefixes = 0; |
100 | 18.3k | parsedmaptable = false; |
101 | 18.3k | parsedbreaktable = false; |
102 | 18.3k | iconvtable = NULL; |
103 | 18.3k | oconvtable = NULL; |
104 | | // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN) |
105 | 18.3k | simplifiedcpd = 0; |
106 | 18.3k | parsedcheckcpd = false; |
107 | 18.3k | parseddefcpd = false; |
108 | 18.3k | phone = NULL; |
109 | 18.3k | compoundflag = FLAG_NULL; // permits word in compound forms |
110 | 18.3k | compoundbegin = FLAG_NULL; // may be first word in compound forms |
111 | 18.3k | compoundmiddle = FLAG_NULL; // may be middle word in compound forms |
112 | 18.3k | compoundend = FLAG_NULL; // may be last word in compound forms |
113 | 18.3k | compoundroot = FLAG_NULL; // compound word signing flag |
114 | 18.3k | compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word |
115 | 18.3k | compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word |
116 | 18.3k | compoundmoresuffixes = 0; // allow more suffixes within compound words |
117 | 18.3k | checkcompounddup = 0; // forbid double words in compounds |
118 | 18.3k | checkcompoundrep = 0; // forbid bad compounds (may be non-compound word with |
119 | | // a REP substitution) |
120 | 18.3k | checkcompoundcase = |
121 | 18.3k | 0; // forbid upper and lowercase combinations at word bounds |
122 | 18.3k | checkcompoundtriple = 0; // forbid compounds with triple letters |
123 | 18.3k | simplifiedtriple = 0; // allow simplified triple letters in compounds |
124 | | // (Schiff+fahrt -> Schiffahrt) |
125 | 18.3k | forbiddenword = FORBIDDENWORD; // forbidden word signing flag |
126 | 18.3k | nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag |
127 | 18.3k | nongramsuggest = FLAG_NULL; |
128 | 18.3k | langnum = 0; // language code (see http://l10n.openoffice.org/languages.html) |
129 | 18.3k | needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes |
130 | 18.3k | cpdwordmax = -1; // default: unlimited wordcount in compound words |
131 | 18.3k | cpdmin = -1; // undefined |
132 | 18.3k | cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words |
133 | 18.3k | pfxappnd = NULL; // previous prefix for counting syllables of the prefix BUG |
134 | 18.3k | sfxappnd = NULL; // previous suffix for counting syllables of the suffix BUG |
135 | 18.3k | sfxextra = 0; // modifier for syllable count of sfxappnd BUG |
136 | 18.3k | checknum = 0; // checking numbers, and word with numbers |
137 | 18.3k | havecontclass = 0; // flags of possible continuing classes (double affix) |
138 | | // LEMMA_PRESENT: not put root into the morphological output. Lemma presents |
139 | | // in morhological description in dictionary file. It's often combined with |
140 | | // PSEUDOROOT. |
141 | 18.3k | lemma_present = FLAG_NULL; |
142 | 18.3k | circumfix = FLAG_NULL; |
143 | 18.3k | onlyincompound = FLAG_NULL; |
144 | 18.3k | maxngramsugs = -1; // undefined |
145 | 18.3k | maxdiff = -1; // undefined |
146 | 18.3k | onlymaxdiff = 0; |
147 | 18.3k | maxcpdsugs = -1; // undefined |
148 | 18.3k | nosplitsugs = 0; |
149 | 18.3k | sugswithdots = 0; |
150 | 18.3k | keepcase = 0; |
151 | 18.3k | forceucase = 0; |
152 | 18.3k | warn = 0; |
153 | 18.3k | forbidwarn = 0; |
154 | 18.3k | checksharps = 0; |
155 | 18.3k | substandard = FLAG_NULL; |
156 | 18.3k | fullstrip = 0; |
157 | | |
158 | 18.3k | sfx = NULL; |
159 | 18.3k | pfx = NULL; |
160 | | |
161 | 4.71M | for (int i = 0; i < SETSIZE; i++) { |
162 | 4.69M | pStart[i] = NULL; |
163 | 4.69M | sStart[i] = NULL; |
164 | 4.69M | pFlag[i] = NULL; |
165 | 4.69M | sFlag[i] = NULL; |
166 | 4.69M | } |
167 | | |
168 | 18.3k | memset(contclasses, 0, CONTSIZE * sizeof(char)); |
169 | | |
170 | 18.3k | if (parse_file(affpath, key)) { |
171 | 6.30k | HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n", affpath); |
172 | 6.30k | } |
173 | | |
174 | | /* get encoding for CHECKCOMPOUNDCASE */ |
175 | 18.3k | if (!utf8) { |
176 | 13.9k | csconv = get_current_cs(get_encoding()); |
177 | 3.58M | for (int i = 0; i <= 255; i++) { |
178 | 3.57M | if ((csconv[i].cupper != csconv[i].clower) && |
179 | 3.57M | (wordchars.find((char)i) == std::string::npos)) { |
180 | 1.56M | wordchars.push_back((char)i); |
181 | 1.56M | } |
182 | 3.57M | } |
183 | 13.9k | } |
184 | | |
185 | | // default BREAK definition |
186 | 18.3k | if (!parsedbreaktable) { |
187 | 16.4k | breaktable.emplace_back("-"); |
188 | 16.4k | breaktable.emplace_back("^-"); |
189 | 16.4k | breaktable.emplace_back("-$"); |
190 | 16.4k | parsedbreaktable = true; |
191 | 16.4k | } |
192 | | |
193 | 18.3k | #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) |
194 | | // not entirely sure this is invalid, so only for fuzzing for now |
195 | 18.3k | if (iconvtable && !iconvtable->check_against_breaktable(breaktable)) { |
196 | 124 | delete iconvtable; |
197 | 124 | iconvtable = nullptr; |
198 | 124 | } |
199 | 18.3k | #endif |
200 | | |
201 | 18.3k | if (cpdmin == -1) |
202 | 17.7k | cpdmin = MINCPDLEN; |
203 | 18.3k | } |
204 | | |
205 | 18.3k | AffixMgr::~AffixMgr() { |
206 | | // pass through linked prefix entries and clean up |
207 | 4.71M | for (int i = 0; i < SETSIZE; i++) { |
208 | 4.69M | pFlag[i] = NULL; |
209 | 4.69M | PfxEntry* ptr = pStart[i]; |
210 | 4.69M | PfxEntry* nptr = NULL; |
211 | 4.71M | while (ptr) { |
212 | 18.5k | nptr = ptr->getNext(); |
213 | 18.5k | delete (ptr); |
214 | 18.5k | ptr = nptr; |
215 | 18.5k | nptr = NULL; |
216 | 18.5k | } |
217 | 4.69M | } |
218 | | |
219 | | // pass through linked suffix entries and clean up |
220 | 4.71M | for (int j = 0; j < SETSIZE; j++) { |
221 | 4.69M | sFlag[j] = NULL; |
222 | 4.69M | SfxEntry* ptr = sStart[j]; |
223 | 4.69M | SfxEntry* nptr = NULL; |
224 | 4.72M | while (ptr) { |
225 | 27.1k | nptr = ptr->getNext(); |
226 | 27.1k | delete (ptr); |
227 | 27.1k | ptr = nptr; |
228 | 27.1k | nptr = NULL; |
229 | 27.1k | } |
230 | 4.69M | sStart[j] = NULL; |
231 | 4.69M | } |
232 | | |
233 | 18.3k | delete iconvtable; |
234 | 18.3k | delete oconvtable; |
235 | 18.3k | delete phone; |
236 | | |
237 | 18.3k | FREE_FLAG(compoundflag); |
238 | 18.3k | FREE_FLAG(compoundbegin); |
239 | 18.3k | FREE_FLAG(compoundmiddle); |
240 | 18.3k | FREE_FLAG(compoundend); |
241 | 18.3k | FREE_FLAG(compoundpermitflag); |
242 | 18.3k | FREE_FLAG(compoundforbidflag); |
243 | 18.3k | FREE_FLAG(compoundroot); |
244 | 18.3k | FREE_FLAG(forbiddenword); |
245 | 18.3k | FREE_FLAG(nosuggest); |
246 | 18.3k | FREE_FLAG(nongramsuggest); |
247 | 18.3k | FREE_FLAG(needaffix); |
248 | 18.3k | FREE_FLAG(lemma_present); |
249 | 18.3k | FREE_FLAG(circumfix); |
250 | 18.3k | FREE_FLAG(onlyincompound); |
251 | | |
252 | 18.3k | cpdwordmax = 0; |
253 | 18.3k | pHMgr = NULL; |
254 | 18.3k | cpdmin = 0; |
255 | 18.3k | cpdmaxsyllable = 0; |
256 | 18.3k | checknum = 0; |
257 | | #ifdef MOZILLA_CLIENT |
258 | | delete[] csconv; |
259 | | #endif |
260 | 18.3k | } |
261 | | |
262 | 18.3k | void AffixMgr::finishFileMgr(FileMgr* afflst) { |
263 | 18.3k | delete afflst; |
264 | | |
265 | | // convert affix trees to sorted list |
266 | 18.3k | process_pfx_tree_to_list(); |
267 | 18.3k | process_sfx_tree_to_list(); |
268 | 18.3k | } |
269 | | |
270 | | // read in aff file and build up prefix and suffix entry objects |
271 | 18.3k | int AffixMgr::parse_file(const char* affpath, const char* key) { |
272 | | |
273 | | // checking flag duplication |
274 | 18.3k | char dupflags[CONTSIZE]; |
275 | 18.3k | char dupflags_ini = 1; |
276 | | |
277 | | // first line indicator for removing byte order mark |
278 | 18.3k | int firstline = 1; |
279 | | |
280 | | // open the affix file |
281 | 18.3k | FileMgr* afflst = new FileMgr(affpath, key); |
282 | 18.3k | if (!afflst) { |
283 | 0 | HUNSPELL_WARNING( |
284 | 0 | stderr, "error: could not open affix description file %s\n", affpath); |
285 | 0 | return 1; |
286 | 0 | } |
287 | | |
288 | | // step one is to parse the affix file building up the internal |
289 | | // affix data structures |
290 | | |
291 | | // read in each line ignoring any that do not |
292 | | // start with a known line type indicator |
293 | 18.3k | std::string line; |
294 | 587k | while (afflst->getline(line)) { |
295 | 575k | mychomp(line); |
296 | | |
297 | | /* remove byte order mark */ |
298 | 575k | if (firstline) { |
299 | 18.0k | firstline = 0; |
300 | | // Affix file begins with byte order mark: possible incompatibility with |
301 | | // old Hunspell versions |
302 | 18.0k | if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) { |
303 | 1 | line.erase(0, 3); |
304 | 1 | } |
305 | 18.0k | } |
306 | | |
307 | | /* parse in the keyboard string */ |
308 | 575k | if (line.compare(0, 3, "KEY", 3) == 0) { |
309 | 197 | if (!parse_string(line, keystring, afflst->getlinenum())) { |
310 | 6 | finishFileMgr(afflst); |
311 | 6 | return 1; |
312 | 6 | } |
313 | 197 | } |
314 | | |
315 | | /* parse in the try string */ |
316 | 575k | if (line.compare(0, 3, "TRY", 3) == 0) { |
317 | 1.05k | if (!parse_string(line, trystring, afflst->getlinenum())) { |
318 | 26 | finishFileMgr(afflst); |
319 | 26 | return 1; |
320 | 26 | } |
321 | 1.05k | } |
322 | | |
323 | | /* parse in the name of the character set used by the .dict and .aff */ |
324 | 575k | if (line.compare(0, 3, "SET", 3) == 0) { |
325 | 5.46k | if (!parse_string(line, encoding, afflst->getlinenum())) { |
326 | 153 | finishFileMgr(afflst); |
327 | 153 | return 1; |
328 | 153 | } |
329 | 5.30k | if (encoding == "UTF-8") { |
330 | 4.40k | utf8 = 1; |
331 | 4.40k | } |
332 | 5.30k | } |
333 | | |
334 | | /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left |
335 | | * writing system */ |
336 | 575k | if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0) |
337 | 5.04k | complexprefixes = 1; |
338 | | |
339 | | /* parse in the flag used by the controlled compound words */ |
340 | 575k | if (line.compare(0, 12, "COMPOUNDFLAG", 12) == 0) { |
341 | 3.30k | if (!parse_flag(line, &compoundflag, afflst)) { |
342 | 113 | finishFileMgr(afflst); |
343 | 113 | return 1; |
344 | 113 | } |
345 | 3.30k | } |
346 | | |
347 | | /* parse in the flag used by compound words */ |
348 | 575k | if (line.compare(0, 13, "COMPOUNDBEGIN", 13) == 0) { |
349 | 1.22k | if (complexprefixes) { |
350 | 518 | if (!parse_flag(line, &compoundend, afflst)) { |
351 | 8 | finishFileMgr(afflst); |
352 | 8 | return 1; |
353 | 8 | } |
354 | 710 | } else { |
355 | 710 | if (!parse_flag(line, &compoundbegin, afflst)) { |
356 | 3 | finishFileMgr(afflst); |
357 | 3 | return 1; |
358 | 3 | } |
359 | 710 | } |
360 | 1.22k | } |
361 | | |
362 | | /* parse in the flag used by compound words */ |
363 | 575k | if (line.compare(0, 14, "COMPOUNDMIDDLE", 14) == 0) { |
364 | 317 | if (!parse_flag(line, &compoundmiddle, afflst)) { |
365 | 11 | finishFileMgr(afflst); |
366 | 11 | return 1; |
367 | 11 | } |
368 | 317 | } |
369 | | |
370 | | /* parse in the flag used by compound words */ |
371 | 575k | if (line.compare(0, 11, "COMPOUNDEND", 11) == 0) { |
372 | 950 | if (complexprefixes) { |
373 | 475 | if (!parse_flag(line, &compoundbegin, afflst)) { |
374 | 18 | finishFileMgr(afflst); |
375 | 18 | return 1; |
376 | 18 | } |
377 | 475 | } else { |
378 | 475 | if (!parse_flag(line, &compoundend, afflst)) { |
379 | 10 | finishFileMgr(afflst); |
380 | 10 | return 1; |
381 | 10 | } |
382 | 475 | } |
383 | 950 | } |
384 | | |
385 | | /* parse in the data used by compound_check() method */ |
386 | 575k | if (line.compare(0, 15, "COMPOUNDWORDMAX", 15) == 0) { |
387 | 280 | if (!parse_num(line, &cpdwordmax, afflst)) { |
388 | 9 | finishFileMgr(afflst); |
389 | 9 | return 1; |
390 | 9 | } |
391 | 280 | } |
392 | | |
393 | | /* parse in the flag sign compounds in dictionary */ |
394 | 574k | if (line.compare(0, 12, "COMPOUNDROOT", 12) == 0) { |
395 | 407 | if (!parse_flag(line, &compoundroot, afflst)) { |
396 | 5 | finishFileMgr(afflst); |
397 | 5 | return 1; |
398 | 5 | } |
399 | 407 | } |
400 | | |
401 | | /* parse in the flag used by compound_check() method */ |
402 | 574k | if (line.compare(0, 18, "COMPOUNDPERMITFLAG", 18) == 0) { |
403 | 446 | if (!parse_flag(line, &compoundpermitflag, afflst)) { |
404 | 8 | finishFileMgr(afflst); |
405 | 8 | return 1; |
406 | 8 | } |
407 | 446 | } |
408 | | |
409 | | /* parse in the flag used by compound_check() method */ |
410 | 574k | if (line.compare(0, 18, "COMPOUNDFORBIDFLAG", 18) == 0) { |
411 | 376 | if (!parse_flag(line, &compoundforbidflag, afflst)) { |
412 | 5 | finishFileMgr(afflst); |
413 | 5 | return 1; |
414 | 5 | } |
415 | 376 | } |
416 | | |
417 | 574k | if (line.compare(0, 20, "COMPOUNDMORESUFFIXES", 20) == 0) { |
418 | 2.20k | compoundmoresuffixes = 1; |
419 | 2.20k | } |
420 | | |
421 | 574k | if (line.compare(0, 16, "CHECKCOMPOUNDDUP", 16) == 0) { |
422 | 510 | checkcompounddup = 1; |
423 | 510 | } |
424 | | |
425 | 574k | if (line.compare(0, 16, "CHECKCOMPOUNDREP", 16) == 0) { |
426 | 419 | checkcompoundrep = 1; |
427 | 419 | } |
428 | | |
429 | 574k | if (line.compare(0, 19, "CHECKCOMPOUNDTRIPLE", 19) == 0) { |
430 | 168 | checkcompoundtriple = 1; |
431 | 168 | } |
432 | | |
433 | 574k | if (line.compare(0, 16, "SIMPLIFIEDTRIPLE", 16) == 0) { |
434 | 196 | simplifiedtriple = 1; |
435 | 196 | } |
436 | | |
437 | 574k | if (line.compare(0, 17, "CHECKCOMPOUNDCASE", 17) == 0) { |
438 | 461 | checkcompoundcase = 1; |
439 | 461 | } |
440 | | |
441 | 574k | if (line.compare(0, 9, "NOSUGGEST", 9) == 0) { |
442 | 260 | if (!parse_flag(line, &nosuggest, afflst)) { |
443 | 13 | finishFileMgr(afflst); |
444 | 13 | return 1; |
445 | 13 | } |
446 | 260 | } |
447 | | |
448 | 574k | if (line.compare(0, 14, "NONGRAMSUGGEST", 14) == 0) { |
449 | 380 | if (!parse_flag(line, &nongramsuggest, afflst)) { |
450 | 10 | finishFileMgr(afflst); |
451 | 10 | return 1; |
452 | 10 | } |
453 | 380 | } |
454 | | |
455 | | /* parse in the flag used by forbidden words */ |
456 | 574k | if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) { |
457 | 2.08k | if (!parse_flag(line, &forbiddenword, afflst)) { |
458 | 41 | finishFileMgr(afflst); |
459 | 41 | return 1; |
460 | 41 | } |
461 | 2.08k | } |
462 | | |
463 | | /* parse in the flag used by forbidden words (is deprecated) */ |
464 | 574k | if (line.compare(0, 13, "LEMMA_PRESENT", 13) == 0) { |
465 | 79 | if (!parse_flag(line, &lemma_present, afflst)) { |
466 | 2 | finishFileMgr(afflst); |
467 | 2 | return 1; |
468 | 2 | } |
469 | 79 | } |
470 | | |
471 | | /* parse in the flag used by circumfixes */ |
472 | 574k | if (line.compare(0, 9, "CIRCUMFIX", 9) == 0) { |
473 | 585 | if (!parse_flag(line, &circumfix, afflst)) { |
474 | 13 | finishFileMgr(afflst); |
475 | 13 | return 1; |
476 | 13 | } |
477 | 585 | } |
478 | | |
479 | | /* parse in the flag used by fogemorphemes */ |
480 | 574k | if (line.compare(0, 14, "ONLYINCOMPOUND", 14) == 0) { |
481 | 622 | if (!parse_flag(line, &onlyincompound, afflst)) { |
482 | 25 | finishFileMgr(afflst); |
483 | 25 | return 1; |
484 | 25 | } |
485 | 622 | } |
486 | | |
487 | | /* parse in the flag used by `needaffixs' (is deprecated) */ |
488 | 574k | if (line.compare(0, 10, "PSEUDOROOT", 10) == 0) { |
489 | 172 | if (!parse_flag(line, &needaffix, afflst)) { |
490 | 13 | finishFileMgr(afflst); |
491 | 13 | return 1; |
492 | 13 | } |
493 | 172 | } |
494 | | |
495 | | /* parse in the flag used by `needaffixs' */ |
496 | 574k | if (line.compare(0, 9, "NEEDAFFIX", 9) == 0) { |
497 | 848 | if (!parse_flag(line, &needaffix, afflst)) { |
498 | 23 | finishFileMgr(afflst); |
499 | 23 | return 1; |
500 | 23 | } |
501 | 848 | } |
502 | | |
503 | | /* parse in the minimal length for words in compounds */ |
504 | 574k | if (line.compare(0, 11, "COMPOUNDMIN", 11) == 0) { |
505 | 673 | if (!parse_num(line, &cpdmin, afflst)) { |
506 | 27 | finishFileMgr(afflst); |
507 | 27 | return 1; |
508 | 27 | } |
509 | 646 | if (cpdmin < 1) |
510 | 556 | cpdmin = 1; |
511 | 646 | } |
512 | | |
513 | | /* parse in the max. words and syllables in compounds */ |
514 | 574k | if (line.compare(0, 16, "COMPOUNDSYLLABLE", 16) == 0) { |
515 | 51.4k | if (!parse_cpdsyllable(line, afflst)) { |
516 | 7 | finishFileMgr(afflst); |
517 | 7 | return 1; |
518 | 7 | } |
519 | 51.4k | } |
520 | | |
521 | | /* parse in the flag used by compound_check() method */ |
522 | 574k | if (line.compare(0, 11, "SYLLABLENUM", 11) == 0) { |
523 | 49 | if (!parse_string(line, cpdsyllablenum, afflst->getlinenum())) { |
524 | 20 | finishFileMgr(afflst); |
525 | 20 | return 1; |
526 | 20 | } |
527 | 49 | } |
528 | | |
529 | | /* parse in the flag used by the controlled compound words */ |
530 | 574k | if (line.compare(0, 8, "CHECKNUM", 8) == 0) { |
531 | 177 | checknum = 1; |
532 | 177 | } |
533 | | |
534 | | /* parse in the extra word characters */ |
535 | 574k | if (line.compare(0, 9, "WORDCHARS", 9) == 0) { |
536 | 280 | if (!parse_array(line, wordchars, wordchars_utf16, |
537 | 280 | utf8, afflst->getlinenum())) { |
538 | 9 | finishFileMgr(afflst); |
539 | 9 | return 1; |
540 | 9 | } |
541 | 280 | } |
542 | | |
543 | | /* parse in the ignored characters (for example, Arabic optional diacretics |
544 | | * charachters */ |
545 | 574k | if (line.compare(0, 6, "IGNORE", 6) == 0) { |
546 | 1.43k | if (!parse_array(line, ignorechars, ignorechars_utf16, |
547 | 1.43k | utf8, afflst->getlinenum())) { |
548 | 35 | finishFileMgr(afflst); |
549 | 35 | return 1; |
550 | 35 | } |
551 | 1.43k | } |
552 | | |
553 | | /* parse in the input conversion table */ |
554 | 574k | if (line.compare(0, 5, "ICONV", 5) == 0) { |
555 | 2.38k | if (!parse_convtable(line, afflst, &iconvtable, "ICONV")) { |
556 | 881 | finishFileMgr(afflst); |
557 | 881 | return 1; |
558 | 881 | } |
559 | 2.38k | } |
560 | | |
561 | | /* parse in the output conversion table */ |
562 | 573k | if (line.compare(0, 5, "OCONV", 5) == 0) { |
563 | 168 | if (!parse_convtable(line, afflst, &oconvtable, "OCONV")) { |
564 | 115 | finishFileMgr(afflst); |
565 | 115 | return 1; |
566 | 115 | } |
567 | 168 | } |
568 | | |
569 | | /* parse in the phonetic translation table */ |
570 | 573k | if (line.compare(0, 5, "PHONE", 5) == 0) { |
571 | 1.33k | if (!parse_phonetable(line, afflst)) { |
572 | 320 | finishFileMgr(afflst); |
573 | 320 | return 1; |
574 | 320 | } |
575 | 1.33k | } |
576 | | |
577 | | /* parse in the checkcompoundpattern table */ |
578 | 573k | if (line.compare(0, 20, "CHECKCOMPOUNDPATTERN", 20) == 0) { |
579 | 869 | if (!parse_checkcpdtable(line, afflst)) { |
580 | 802 | finishFileMgr(afflst); |
581 | 802 | return 1; |
582 | 802 | } |
583 | 869 | } |
584 | | |
585 | | /* parse in the defcompound table */ |
586 | 572k | if (line.compare(0, 12, "COMPOUNDRULE", 12) == 0) { |
587 | 933 | if (!parse_defcpdtable(line, afflst)) { |
588 | 863 | finishFileMgr(afflst); |
589 | 863 | return 1; |
590 | 863 | } |
591 | 933 | } |
592 | | |
593 | | /* parse in the related character map table */ |
594 | 571k | if (line.compare(0, 3, "MAP", 3) == 0) { |
595 | 731 | if (!parse_maptable(line, afflst)) { |
596 | 418 | finishFileMgr(afflst); |
597 | 418 | return 1; |
598 | 418 | } |
599 | 731 | } |
600 | | |
601 | | /* parse in the word breakpoints table */ |
602 | 571k | if (line.compare(0, 5, "BREAK", 5) == 0) { |
603 | 1.94k | if (!parse_breaktable(line, afflst)) { |
604 | 411 | finishFileMgr(afflst); |
605 | 411 | return 1; |
606 | 411 | } |
607 | 1.94k | } |
608 | | |
609 | | /* parse in the language for language specific codes */ |
610 | 570k | if (line.compare(0, 4, "LANG", 4) == 0) { |
611 | 1.62k | if (!parse_string(line, lang, afflst->getlinenum())) { |
612 | 37 | finishFileMgr(afflst); |
613 | 37 | return 1; |
614 | 37 | } |
615 | 1.59k | langnum = get_lang_num(lang); |
616 | 1.59k | } |
617 | | |
618 | 570k | if (line.compare(0, 7, "VERSION", 7) == 0) { |
619 | 3.75k | size_t startpos = line.find_first_not_of(" \t", 7); |
620 | 3.75k | if (startpos != std::string::npos) { |
621 | 3.10k | version = line.substr(startpos); |
622 | 3.10k | } |
623 | 3.75k | } |
624 | | |
625 | 570k | if (line.compare(0, 12, "MAXNGRAMSUGS", 12) == 0) { |
626 | 400 | if (!parse_num(line, &maxngramsugs, afflst)) { |
627 | 17 | finishFileMgr(afflst); |
628 | 17 | return 1; |
629 | 17 | } |
630 | 400 | } |
631 | | |
632 | 570k | if (line.compare(0, 11, "ONLYMAXDIFF", 11) == 0) |
633 | 115 | onlymaxdiff = 1; |
634 | | |
635 | 570k | if (line.compare(0, 7, "MAXDIFF", 7) == 0) { |
636 | 783 | if (!parse_num(line, &maxdiff, afflst)) { |
637 | 11 | finishFileMgr(afflst); |
638 | 11 | return 1; |
639 | 11 | } |
640 | 783 | } |
641 | | |
642 | 570k | if (line.compare(0, 10, "MAXCPDSUGS", 10) == 0) { |
643 | 179 | if (!parse_num(line, &maxcpdsugs, afflst)) { |
644 | 13 | finishFileMgr(afflst); |
645 | 13 | return 1; |
646 | 13 | } |
647 | 179 | } |
648 | | |
649 | 570k | if (line.compare(0, 11, "NOSPLITSUGS", 11) == 0) { |
650 | 2.90k | nosplitsugs = 1; |
651 | 2.90k | } |
652 | | |
653 | 570k | if (line.compare(0, 9, "FULLSTRIP", 9) == 0) { |
654 | 378 | fullstrip = 1; |
655 | 378 | } |
656 | | |
657 | 570k | if (line.compare(0, 12, "SUGSWITHDOTS", 12) == 0) { |
658 | 344 | sugswithdots = 1; |
659 | 344 | } |
660 | | |
661 | | /* parse in the flag used by forbidden words */ |
662 | 570k | if (line.compare(0, 8, "KEEPCASE", 8) == 0) { |
663 | 258 | if (!parse_flag(line, &keepcase, afflst)) { |
664 | 5 | finishFileMgr(afflst); |
665 | 5 | return 1; |
666 | 5 | } |
667 | 258 | } |
668 | | |
669 | | /* parse in the flag used by `forceucase' */ |
670 | 570k | if (line.compare(0, 10, "FORCEUCASE", 10) == 0) { |
671 | 397 | if (!parse_flag(line, &forceucase, afflst)) { |
672 | 16 | finishFileMgr(afflst); |
673 | 16 | return 1; |
674 | 16 | } |
675 | 397 | } |
676 | | |
677 | | /* parse in the flag used by `warn' */ |
678 | 570k | if (line.compare(0, 4, "WARN", 4) == 0) { |
679 | 291 | if (!parse_flag(line, &warn, afflst)) { |
680 | 36 | finishFileMgr(afflst); |
681 | 36 | return 1; |
682 | 36 | } |
683 | 291 | } |
684 | | |
685 | 570k | if (line.compare(0, 10, "FORBIDWARN", 10) == 0) { |
686 | 94 | forbidwarn = 1; |
687 | 94 | } |
688 | | |
689 | | /* parse in the flag used by the affix generator */ |
690 | 570k | if (line.compare(0, 11, "SUBSTANDARD", 11) == 0) { |
691 | 107 | if (!parse_flag(line, &substandard, afflst)) { |
692 | 5 | finishFileMgr(afflst); |
693 | 5 | return 1; |
694 | 5 | } |
695 | 107 | } |
696 | | |
697 | 570k | if (line.compare(0, 11, "CHECKSHARPS", 11) == 0) { |
698 | 1.01k | checksharps = 1; |
699 | 1.01k | } |
700 | | |
701 | | /* parse this affix: P - prefix, S - suffix */ |
702 | | // affix type |
703 | 570k | char ft = ' '; |
704 | 570k | if (line.compare(0, 3, "PFX", 3) == 0) |
705 | 15.0k | ft = complexprefixes ? 'S' : 'P'; |
706 | 570k | if (line.compare(0, 3, "SFX", 3) == 0) |
707 | 20.0k | ft = complexprefixes ? 'P' : 'S'; |
708 | 570k | if (ft != ' ') { |
709 | 35.1k | if (dupflags_ini) { |
710 | 7.26k | memset(dupflags, 0, sizeof(dupflags)); |
711 | 7.26k | dupflags_ini = 0; |
712 | 7.26k | } |
713 | 35.1k | if (!parse_affix(line, ft, afflst, dupflags)) { |
714 | 1.74k | finishFileMgr(afflst); |
715 | 1.74k | return 1; |
716 | 1.74k | } |
717 | 35.1k | } |
718 | 570k | } |
719 | | |
720 | 12.0k | finishFileMgr(afflst); |
721 | | // affix trees are sorted now |
722 | | |
723 | | // now we can speed up performance greatly taking advantage of the |
724 | | // relationship between the affixes and the idea of "subsets". |
725 | | |
726 | | // View each prefix as a potential leading subset of another and view |
727 | | // each suffix (reversed) as a potential trailing subset of another. |
728 | | |
729 | | // To illustrate this relationship if we know the prefix "ab" is found in the |
730 | | // word to examine, only prefixes that "ab" is a leading subset of need be |
731 | | // examined. |
732 | | // Furthermore is "ab" is not present then none of the prefixes that "ab" is |
733 | | // is a subset need be examined. |
734 | | // The same argument goes for suffix string that are reversed. |
735 | | |
736 | | // Then to top this off why not examine the first char of the word to quickly |
737 | | // limit the set of prefixes to examine (i.e. the prefixes to examine must |
738 | | // be leading supersets of the first character of the word (if they exist) |
739 | | |
740 | | // To take advantage of this "subset" relationship, we need to add two links |
741 | | // from entry. One to take next if the current prefix is found (call it |
742 | | // nexteq) |
743 | | // and one to take next if the current prefix is not found (call it nextne). |
744 | | |
745 | | // Since we have built ordered lists, all that remains is to properly |
746 | | // initialize |
747 | | // the nextne and nexteq pointers that relate them |
748 | | |
749 | 12.0k | process_pfx_order(); |
750 | 12.0k | process_sfx_order(); |
751 | | |
752 | 12.0k | return 0; |
753 | 18.3k | } |
754 | | |
755 | | // we want to be able to quickly access prefix information |
756 | | // both by prefix flag, and sorted by prefix string itself |
757 | | // so we need to set up two indexes |
758 | | |
759 | 18.5k | int AffixMgr::build_pfxtree(PfxEntry* pfxptr) { |
760 | 18.5k | PfxEntry* ptr; |
761 | 18.5k | PfxEntry* pptr; |
762 | 18.5k | PfxEntry* ep = pfxptr; |
763 | | |
764 | | // get the right starting points |
765 | 18.5k | const char* key = ep->getKey(); |
766 | 18.5k | const auto flg = (unsigned char)(ep->getFlag() & 0x00FF); |
767 | | |
768 | | // first index by flag which must exist |
769 | 18.5k | ptr = pFlag[flg]; |
770 | 18.5k | ep->setFlgNxt(ptr); |
771 | 18.5k | pFlag[flg] = ep; |
772 | | |
773 | | // handle the special case of null affix string |
774 | 18.5k | if (*key == '\0') { |
775 | | // always inset them at head of list at element 0 |
776 | 7.02k | ptr = pStart[0]; |
777 | 7.02k | ep->setNext(ptr); |
778 | 7.02k | pStart[0] = ep; |
779 | 7.02k | return 0; |
780 | 7.02k | } |
781 | | |
782 | | // now handle the normal case |
783 | 11.5k | ep->setNextEQ(NULL); |
784 | 11.5k | ep->setNextNE(NULL); |
785 | | |
786 | 11.5k | unsigned char sp = *((const unsigned char*)key); |
787 | 11.5k | ptr = pStart[sp]; |
788 | | |
789 | | // handle the first insert |
790 | 11.5k | if (!ptr) { |
791 | 2.63k | pStart[sp] = ep; |
792 | 2.63k | return 0; |
793 | 2.63k | } |
794 | | |
795 | | // otherwise use binary tree insertion so that a sorted |
796 | | // list can easily be generated later |
797 | 8.86k | pptr = NULL; |
798 | 294k | for (;;) { |
799 | 294k | pptr = ptr; |
800 | 294k | if (strcmp(ep->getKey(), ptr->getKey()) <= 0) { |
801 | 290k | ptr = ptr->getNextEQ(); |
802 | 290k | if (!ptr) { |
803 | 8.14k | pptr->setNextEQ(ep); |
804 | 8.14k | break; |
805 | 8.14k | } |
806 | 290k | } else { |
807 | 4.70k | ptr = ptr->getNextNE(); |
808 | 4.70k | if (!ptr) { |
809 | 717 | pptr->setNextNE(ep); |
810 | 717 | break; |
811 | 717 | } |
812 | 4.70k | } |
813 | 294k | } |
814 | 8.86k | return 0; |
815 | 11.5k | } |
816 | | |
817 | | // we want to be able to quickly access suffix information |
818 | | // both by suffix flag, and sorted by the reverse of the |
819 | | // suffix string itself; so we need to set up two indexes |
820 | 27.1k | int AffixMgr::build_sfxtree(SfxEntry* sfxptr) { |
821 | | |
822 | 27.1k | sfxptr->initReverseWord(); |
823 | | |
824 | 27.1k | SfxEntry* ptr; |
825 | 27.1k | SfxEntry* pptr; |
826 | 27.1k | SfxEntry* ep = sfxptr; |
827 | | |
828 | | /* get the right starting point */ |
829 | 27.1k | const char* key = ep->getKey(); |
830 | 27.1k | const auto flg = (unsigned char)(ep->getFlag() & 0x00FF); |
831 | | |
832 | | // first index by flag which must exist |
833 | 27.1k | ptr = sFlag[flg]; |
834 | 27.1k | ep->setFlgNxt(ptr); |
835 | 27.1k | sFlag[flg] = ep; |
836 | | |
837 | | // next index by affix string |
838 | | |
839 | | // handle the special case of null affix string |
840 | 27.1k | if (*key == '\0') { |
841 | | // always inset them at head of list at element 0 |
842 | 13.4k | ptr = sStart[0]; |
843 | 13.4k | ep->setNext(ptr); |
844 | 13.4k | sStart[0] = ep; |
845 | 13.4k | return 0; |
846 | 13.4k | } |
847 | | |
848 | | // now handle the normal case |
849 | 13.7k | ep->setNextEQ(NULL); |
850 | 13.7k | ep->setNextNE(NULL); |
851 | | |
852 | 13.7k | unsigned char sp = *((const unsigned char*)key); |
853 | 13.7k | ptr = sStart[sp]; |
854 | | |
855 | | // handle the first insert |
856 | 13.7k | if (!ptr) { |
857 | 2.87k | sStart[sp] = ep; |
858 | 2.87k | return 0; |
859 | 2.87k | } |
860 | | |
861 | | // otherwise use binary tree insertion so that a sorted |
862 | | // list can easily be generated later |
863 | 10.8k | pptr = NULL; |
864 | 275k | for (;;) { |
865 | 275k | pptr = ptr; |
866 | 275k | if (strcmp(ep->getKey(), ptr->getKey()) <= 0) { |
867 | 270k | ptr = ptr->getNextEQ(); |
868 | 270k | if (!ptr) { |
869 | 10.0k | pptr->setNextEQ(ep); |
870 | 10.0k | break; |
871 | 10.0k | } |
872 | 270k | } else { |
873 | 5.09k | ptr = ptr->getNextNE(); |
874 | 5.09k | if (!ptr) { |
875 | 825 | pptr->setNextNE(ep); |
876 | 825 | break; |
877 | 825 | } |
878 | 5.09k | } |
879 | 275k | } |
880 | 10.8k | return 0; |
881 | 13.7k | } |
882 | | |
883 | | // convert from binary tree to sorted list |
884 | 18.3k | int AffixMgr::process_pfx_tree_to_list() { |
885 | 4.69M | for (int i = 1; i < SETSIZE; i++) { |
886 | 4.67M | pStart[i] = process_pfx_in_order(pStart[i], NULL); |
887 | 4.67M | } |
888 | 18.3k | return 0; |
889 | 18.3k | } |
890 | | |
891 | 4.70M | PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr) { |
892 | 4.70M | if (ptr) { |
893 | 11.5k | nptr = process_pfx_in_order(ptr->getNextNE(), nptr); |
894 | 11.5k | ptr->setNext(nptr); |
895 | 11.5k | nptr = process_pfx_in_order(ptr->getNextEQ(), ptr); |
896 | 11.5k | } |
897 | 4.70M | return nptr; |
898 | 4.70M | } |
899 | | |
900 | | // convert from binary tree to sorted list |
901 | 18.3k | int AffixMgr::process_sfx_tree_to_list() { |
902 | 4.69M | for (int i = 1; i < SETSIZE; i++) { |
903 | 4.67M | sStart[i] = process_sfx_in_order(sStart[i], NULL); |
904 | 4.67M | } |
905 | 18.3k | return 0; |
906 | 18.3k | } |
907 | | |
908 | 4.70M | SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr) { |
909 | 4.70M | if (ptr) { |
910 | 13.7k | nptr = process_sfx_in_order(ptr->getNextNE(), nptr); |
911 | 13.7k | ptr->setNext(nptr); |
912 | 13.7k | nptr = process_sfx_in_order(ptr->getNextEQ(), ptr); |
913 | 13.7k | } |
914 | 4.70M | return nptr; |
915 | 4.70M | } |
916 | | |
917 | | // reinitialize the PfxEntry links NextEQ and NextNE to speed searching |
918 | | // using the idea of leading subsets this time |
919 | 12.0k | int AffixMgr::process_pfx_order() { |
920 | 12.0k | PfxEntry* ptr; |
921 | | |
922 | | // loop through each prefix list starting point |
923 | 3.08M | for (int i = 1; i < SETSIZE; i++) { |
924 | 3.07M | ptr = pStart[i]; |
925 | | |
926 | | // look through the remainder of the list |
927 | | // and find next entry with affix that |
928 | | // the current one is not a subset of |
929 | | // mark that as destination for NextNE |
930 | | // use next in list that you are a subset |
931 | | // of as NextEQ |
932 | | |
933 | 3.07M | for (; ptr != NULL; ptr = ptr->getNext()) { |
934 | 5.75k | PfxEntry* nptr = ptr->getNext(); |
935 | 184k | for (; nptr != NULL; nptr = nptr->getNext()) { |
936 | 180k | if (!isSubset(ptr->getKey(), nptr->getKey())) |
937 | 1.36k | break; |
938 | 180k | } |
939 | 5.75k | ptr->setNextNE(nptr); |
940 | 5.75k | ptr->setNextEQ(NULL); |
941 | 5.75k | if ((ptr->getNext()) && |
942 | 5.75k | isSubset(ptr->getKey(), (ptr->getNext())->getKey())) |
943 | 3.63k | ptr->setNextEQ(ptr->getNext()); |
944 | 5.75k | } |
945 | | |
946 | | // now clean up by adding smart search termination strings: |
947 | | // if you are already a superset of the previous prefix |
948 | | // but not a subset of the next, search can end here |
949 | | // so set NextNE properly |
950 | | |
951 | 3.07M | ptr = pStart[i]; |
952 | 3.07M | for (; ptr != NULL; ptr = ptr->getNext()) { |
953 | 5.75k | PfxEntry* nptr = ptr->getNext(); |
954 | 5.75k | PfxEntry* mptr = NULL; |
955 | 184k | for (; nptr != NULL; nptr = nptr->getNext()) { |
956 | 180k | if (!isSubset(ptr->getKey(), nptr->getKey())) |
957 | 1.36k | break; |
958 | 178k | mptr = nptr; |
959 | 178k | } |
960 | 5.75k | if (mptr) |
961 | 3.63k | mptr->setNextNE(NULL); |
962 | 5.75k | } |
963 | 3.07M | } |
964 | 12.0k | return 0; |
965 | 12.0k | } |
966 | | |
967 | | // initialize the SfxEntry links NextEQ and NextNE to speed searching |
968 | | // using the idea of leading subsets this time |
969 | 12.0k | int AffixMgr::process_sfx_order() { |
970 | 12.0k | SfxEntry* ptr; |
971 | | |
972 | | // loop through each prefix list starting point |
973 | 3.08M | for (int i = 1; i < SETSIZE; i++) { |
974 | 3.07M | ptr = sStart[i]; |
975 | | |
976 | | // look through the remainder of the list |
977 | | // and find next entry with affix that |
978 | | // the current one is not a subset of |
979 | | // mark that as destination for NextNE |
980 | | // use next in list that you are a subset |
981 | | // of as NextEQ |
982 | | |
983 | 3.07M | for (; ptr != NULL; ptr = ptr->getNext()) { |
984 | 5.96k | SfxEntry* nptr = ptr->getNext(); |
985 | 59.7k | for (; nptr != NULL; nptr = nptr->getNext()) { |
986 | 55.2k | if (!isSubset(ptr->getKey(), nptr->getKey())) |
987 | 1.54k | break; |
988 | 55.2k | } |
989 | 5.96k | ptr->setNextNE(nptr); |
990 | 5.96k | ptr->setNextEQ(NULL); |
991 | 5.96k | if ((ptr->getNext()) && |
992 | 5.96k | isSubset(ptr->getKey(), (ptr->getNext())->getKey())) |
993 | 3.66k | ptr->setNextEQ(ptr->getNext()); |
994 | 5.96k | } |
995 | | |
996 | | // now clean up by adding smart search termination strings: |
997 | | // if you are already a superset of the previous suffix |
998 | | // but not a subset of the next, search can end here |
999 | | // so set NextNE properly |
1000 | | |
1001 | 3.07M | ptr = sStart[i]; |
1002 | 3.07M | for (; ptr != NULL; ptr = ptr->getNext()) { |
1003 | 5.96k | SfxEntry* nptr = ptr->getNext(); |
1004 | 5.96k | SfxEntry* mptr = NULL; |
1005 | 59.7k | for (; nptr != NULL; nptr = nptr->getNext()) { |
1006 | 55.2k | if (!isSubset(ptr->getKey(), nptr->getKey())) |
1007 | 1.54k | break; |
1008 | 53.7k | mptr = nptr; |
1009 | 53.7k | } |
1010 | 5.96k | if (mptr) |
1011 | 3.66k | mptr->setNextNE(NULL); |
1012 | 5.96k | } |
1013 | 3.07M | } |
1014 | 12.0k | return 0; |
1015 | 12.0k | } |
1016 | | |
1017 | | // add flags to the result for dictionary debugging |
1018 | 0 | std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) { |
1019 | 0 | std::string st = encode_flag(flag); |
1020 | 0 | result.push_back(MSEP_FLD); |
1021 | 0 | result.append(MORPH_FLAG); |
1022 | 0 | result.append(st); |
1023 | 0 | return result; |
1024 | 0 | } |
1025 | | |
1026 | | // calculate the character length of the condition |
1027 | 24.7k | int AffixMgr::condlen(const std::string& s) { |
1028 | 24.7k | int l = 0; |
1029 | 24.7k | bool group = false; |
1030 | 24.7k | auto st = s.begin(), end = s.end(); |
1031 | 3.76M | while (st != end) { |
1032 | 3.74M | if (*st == '[') { |
1033 | 41.8k | group = true; |
1034 | 41.8k | l++; |
1035 | 3.70M | } else if (*st == ']') |
1036 | 16.9k | group = false; |
1037 | 3.68M | else if (!group && (!utf8 || (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) |
1038 | 3.11M | l++; |
1039 | 3.74M | ++st; |
1040 | 3.74M | } |
1041 | 24.7k | return l; |
1042 | 24.7k | } |
1043 | | |
1044 | 27.8k | int AffixMgr::encodeit(AffEntry& entry, const std::string& cs) { |
1045 | 27.8k | if (cs.compare(".") != 0) { |
1046 | 24.7k | entry.numconds = (char)condlen(cs); |
1047 | 24.7k | const size_t cslen = cs.size(); |
1048 | 24.7k | const size_t short_part = std::min<size_t>(MAXCONDLEN, cslen); |
1049 | 24.7k | memcpy(entry.c.conds, cs.data(), short_part); |
1050 | 24.7k | if (short_part < MAXCONDLEN) { |
1051 | | //blank out the remaining space |
1052 | 13.8k | memset(entry.c.conds + short_part, 0, MAXCONDLEN - short_part); |
1053 | 13.8k | } else if (cs[MAXCONDLEN]) { |
1054 | | //there is more conditions than fit in fixed space, so its |
1055 | | //a long condition |
1056 | 9.16k | entry.opts |= aeLONGCOND; |
1057 | 9.16k | size_t remaining = cs.size() - MAXCONDLEN_1; |
1058 | 9.16k | entry.c.l.conds2 = new char[1 + remaining]; |
1059 | 9.16k | memcpy(entry.c.l.conds2, cs.data() + MAXCONDLEN_1, remaining); |
1060 | 9.16k | entry.c.l.conds2[remaining] = 0; |
1061 | 9.16k | } |
1062 | 24.7k | } else { |
1063 | 3.11k | entry.numconds = 0; |
1064 | 3.11k | entry.c.conds[0] = '\0'; |
1065 | 3.11k | } |
1066 | 27.8k | return 0; |
1067 | 27.8k | } |
1068 | | |
1069 | | // return 1 if s1 is a leading subset of s2 (dots are for infixes) |
1070 | 80.6M | inline int AffixMgr::isSubset(const char* s1, const char* s2) { |
1071 | 104M | while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0') && (*s2 != '\0')) { |
1072 | 24.2M | s1++; |
1073 | 24.2M | s2++; |
1074 | 24.2M | } |
1075 | 80.6M | return (*s1 == '\0'); |
1076 | 80.6M | } |
1077 | | |
1078 | | // check word for prefixes |
1079 | | struct hentry* AffixMgr::prefix_check(const std::string& word, |
1080 | | int start, |
1081 | | int len, |
1082 | | char in_compound, |
1083 | 1.32G | const FLAG needflag) { |
1084 | 1.32G | struct hentry* rv = NULL; |
1085 | | |
1086 | 1.32G | pfx = NULL; |
1087 | 1.32G | pfxappnd = NULL; |
1088 | 1.32G | sfxappnd = NULL; |
1089 | 1.32G | sfxextra = 0; |
1090 | | |
1091 | | // first handle the special case of 0 length prefixes |
1092 | 1.32G | PfxEntry* pe = pStart[0]; |
1093 | 1.67G | while (pe) { |
1094 | 357M | if ( |
1095 | | // fogemorpheme |
1096 | 357M | ((in_compound != IN_CPD_NOT) || |
1097 | 357M | !(pe->getCont() && |
1098 | 55.4M | (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) && |
1099 | | // permit prefixes in compounds |
1100 | 357M | ((in_compound != IN_CPD_END) || |
1101 | 355M | (pe->getCont() && |
1102 | 343M | (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))) { |
1103 | | // check prefix |
1104 | 343M | rv = pe->checkword(word, start, len, in_compound, needflag); |
1105 | 343M | if (rv) { |
1106 | 7.18M | pfx = pe; // BUG: pfx not stateless |
1107 | 7.18M | return rv; |
1108 | 7.18M | } |
1109 | 343M | } |
1110 | 350M | pe = pe->getNext(); |
1111 | 350M | } |
1112 | | |
1113 | | // now handle the general case |
1114 | 1.31G | unsigned char sp = word[start]; |
1115 | 1.31G | PfxEntry* pptr = pStart[sp]; |
1116 | | |
1117 | 1.38G | while (pptr) { |
1118 | 70.2M | if (isSubset(pptr->getKey(), word.c_str() + start)) { |
1119 | 68.3M | if ( |
1120 | | // fogemorpheme |
1121 | 68.3M | ((in_compound != IN_CPD_NOT) || |
1122 | 68.3M | !(pptr->getCont() && |
1123 | 6.87M | (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) && |
1124 | | // permit prefixes in compounds |
1125 | 68.3M | ((in_compound != IN_CPD_END) || |
1126 | 68.0M | (pptr->getCont() && (TESTAFF(pptr->getCont(), compoundpermitflag, |
1127 | 65.6M | pptr->getContLen()))))) { |
1128 | | // check prefix |
1129 | 65.6M | rv = pptr->checkword(word, start, len, in_compound, needflag); |
1130 | 65.6M | if (rv) { |
1131 | 285k | pfx = pptr; // BUG: pfx not stateless |
1132 | 285k | return rv; |
1133 | 285k | } |
1134 | 65.6M | } |
1135 | 68.0M | pptr = pptr->getNextEQ(); |
1136 | 68.0M | } else { |
1137 | 1.91M | pptr = pptr->getNextNE(); |
1138 | 1.91M | } |
1139 | 70.2M | } |
1140 | | |
1141 | 1.31G | return NULL; |
1142 | 1.31G | } |
1143 | | |
1144 | | // check word for prefixes and two-level suffixes |
1145 | | struct hentry* AffixMgr::prefix_check_twosfx(const std::string& word, |
1146 | | int start, |
1147 | | int len, |
1148 | | char in_compound, |
1149 | 50.6M | const FLAG needflag) { |
1150 | 50.6M | struct hentry* rv = NULL; |
1151 | | |
1152 | 50.6M | pfx = NULL; |
1153 | 50.6M | sfxappnd = NULL; |
1154 | 50.6M | sfxextra = 0; |
1155 | | |
1156 | | // first handle the special case of 0 length prefixes |
1157 | 50.6M | PfxEntry* pe = pStart[0]; |
1158 | | |
1159 | 114M | while (pe) { |
1160 | 64.2M | rv = pe->check_twosfx(word, start, len, in_compound, needflag); |
1161 | 64.2M | if (rv) |
1162 | 8.61k | return rv; |
1163 | 64.2M | pe = pe->getNext(); |
1164 | 64.2M | } |
1165 | | |
1166 | | // now handle the general case |
1167 | 50.6M | unsigned char sp = word[start]; |
1168 | 50.6M | PfxEntry* pptr = pStart[sp]; |
1169 | | |
1170 | 60.1M | while (pptr) { |
1171 | 9.46M | if (isSubset(pptr->getKey(), word.c_str() + start)) { |
1172 | 8.57M | rv = pptr->check_twosfx(word, start, len, in_compound, needflag); |
1173 | 8.57M | if (rv) { |
1174 | 886 | pfx = pptr; |
1175 | 886 | return rv; |
1176 | 886 | } |
1177 | 8.57M | pptr = pptr->getNextEQ(); |
1178 | 8.57M | } else { |
1179 | 890k | pptr = pptr->getNextNE(); |
1180 | 890k | } |
1181 | 9.46M | } |
1182 | | |
1183 | 50.6M | return NULL; |
1184 | 50.6M | } |
1185 | | |
1186 | | // check word for prefixes and morph |
1187 | | std::string AffixMgr::prefix_check_morph(const std::string& word, |
1188 | | int start, |
1189 | | int len, |
1190 | | char in_compound, |
1191 | 0 | const FLAG needflag) { |
1192 | |
|
1193 | 0 | std::string result; |
1194 | |
|
1195 | 0 | pfx = NULL; |
1196 | 0 | sfxappnd = NULL; |
1197 | 0 | sfxextra = 0; |
1198 | | |
1199 | | // first handle the special case of 0 length prefixes |
1200 | 0 | PfxEntry* pe = pStart[0]; |
1201 | 0 | while (pe) { |
1202 | 0 | std::string st = pe->check_morph(word, start, len, in_compound, needflag); |
1203 | 0 | if (!st.empty()) { |
1204 | 0 | result.append(st); |
1205 | 0 | } |
1206 | 0 | pe = pe->getNext(); |
1207 | 0 | } |
1208 | | |
1209 | | // now handle the general case |
1210 | 0 | unsigned char sp = word[start]; |
1211 | 0 | PfxEntry* pptr = pStart[sp]; |
1212 | |
|
1213 | 0 | while (pptr) { |
1214 | 0 | if (isSubset(pptr->getKey(), word.c_str() + start)) { |
1215 | 0 | std::string st = pptr->check_morph(word, start, len, in_compound, needflag); |
1216 | 0 | if (!st.empty()) { |
1217 | | // fogemorpheme |
1218 | 0 | if ((in_compound != IN_CPD_NOT) || |
1219 | 0 | !((pptr->getCont() && (TESTAFF(pptr->getCont(), onlyincompound, |
1220 | 0 | pptr->getContLen()))))) { |
1221 | 0 | result.append(st); |
1222 | 0 | pfx = pptr; |
1223 | 0 | } |
1224 | 0 | } |
1225 | 0 | pptr = pptr->getNextEQ(); |
1226 | 0 | } else { |
1227 | 0 | pptr = pptr->getNextNE(); |
1228 | 0 | } |
1229 | 0 | } |
1230 | |
|
1231 | 0 | return result; |
1232 | 0 | } |
1233 | | |
1234 | | // check word for prefixes and morph and two-level suffixes |
1235 | | std::string AffixMgr::prefix_check_twosfx_morph(const std::string& word, |
1236 | | int start, |
1237 | | int len, |
1238 | | char in_compound, |
1239 | 0 | const FLAG needflag) { |
1240 | 0 | std::string result; |
1241 | |
|
1242 | 0 | pfx = NULL; |
1243 | 0 | sfxappnd = NULL; |
1244 | 0 | sfxextra = 0; |
1245 | | |
1246 | | // first handle the special case of 0 length prefixes |
1247 | 0 | PfxEntry* pe = pStart[0]; |
1248 | 0 | while (pe) { |
1249 | 0 | std::string st = pe->check_twosfx_morph(word, start, len, in_compound, needflag); |
1250 | 0 | if (!st.empty()) { |
1251 | 0 | result.append(st); |
1252 | 0 | } |
1253 | 0 | pe = pe->getNext(); |
1254 | 0 | } |
1255 | | |
1256 | | // now handle the general case |
1257 | 0 | unsigned char sp = word[start]; |
1258 | 0 | PfxEntry* pptr = pStart[sp]; |
1259 | |
|
1260 | 0 | while (pptr) { |
1261 | 0 | if (isSubset(pptr->getKey(), word.c_str() + start)) { |
1262 | 0 | std::string st = pptr->check_twosfx_morph(word, start, len, in_compound, needflag); |
1263 | 0 | if (!st.empty()) { |
1264 | 0 | result.append(st); |
1265 | 0 | pfx = pptr; |
1266 | 0 | } |
1267 | 0 | pptr = pptr->getNextEQ(); |
1268 | 0 | } else { |
1269 | 0 | pptr = pptr->getNextNE(); |
1270 | 0 | } |
1271 | 0 | } |
1272 | |
|
1273 | 0 | return result; |
1274 | 0 | } |
1275 | | |
1276 | | // Is word a non-compound with a REP substitution (see checkcompoundrep)? |
1277 | 204k | int AffixMgr::cpdrep_check(const std::string& in_word, int wl) { |
1278 | | |
1279 | 204k | if ((wl < 2) || get_reptable().empty()) |
1280 | 168k | return 0; |
1281 | | |
1282 | 35.6k | std::string word(in_word, 0, wl); |
1283 | | |
1284 | 428k | for (const auto& i : get_reptable()) { |
1285 | | // use only available mid patterns |
1286 | 428k | if (!i.outstrings[0].empty()) { |
1287 | 427k | size_t r = 0; |
1288 | 427k | const size_t lenp = i.pattern.size(); |
1289 | | // search every occurence of the pattern in the word |
1290 | 1.45M | while ((r = word.find(i.pattern, r)) != std::string::npos) { |
1291 | 1.03M | std::string candidate(word); |
1292 | 1.03M | candidate.replace(r, lenp, i.outstrings[0]); |
1293 | 1.03M | if (candidate_check(candidate)) |
1294 | 9.56k | return 1; |
1295 | 1.02M | ++r; // search for the next letter |
1296 | 1.02M | } |
1297 | 427k | } |
1298 | 428k | } |
1299 | | |
1300 | 26.1k | return 0; |
1301 | 35.6k | } |
1302 | | |
1303 | | // forbid compound words, if they are in the dictionary as a |
1304 | | // word pair separated by space |
1305 | 802k | int AffixMgr::cpdwordpair_check(const std::string& word, int wl) { |
1306 | 802k | if (wl > 2) { |
1307 | 713k | std::string candidate(word, 0, wl); |
1308 | 17.1M | for (size_t i = 1; i < candidate.size(); i++) { |
1309 | | // go to end of the UTF-8 character |
1310 | 16.5M | if (utf8 && ((candidate[i] & 0xc0) == 0x80)) |
1311 | 270k | continue; |
1312 | 16.2M | candidate.insert(i, 1, ' '); |
1313 | 16.2M | if (candidate_check(candidate)) |
1314 | 85.6k | return 1; |
1315 | 16.1M | candidate.erase(i, 1); |
1316 | 16.1M | } |
1317 | 713k | } |
1318 | | |
1319 | 717k | return 0; |
1320 | 802k | } |
1321 | | |
1322 | | // forbid compoundings when there are special patterns at word bound |
1323 | | int AffixMgr::cpdpat_check(const std::string& word, |
1324 | | size_t pos, |
1325 | | hentry* r1, |
1326 | | hentry* r2, |
1327 | 458k | const char /*affixed*/) { |
1328 | 464k | for (auto& i : checkcpdtable) { |
1329 | 464k | size_t len; |
1330 | 464k | if (isSubset(i.pattern2.c_str(), word.c_str() + pos) && |
1331 | 464k | (!r1 || !i.cond || |
1332 | 108k | (r1->astr && TESTAFF(r1->astr, i.cond, r1->alen))) && |
1333 | 464k | (!r2 || !i.cond2 || |
1334 | 101k | (r2->astr && TESTAFF(r2->astr, i.cond2, r2->alen))) && |
1335 | | // zero length pattern => only TESTAFF |
1336 | | // zero pattern (0/flag) => unmodified stem (zero affixes allowed) |
1337 | 464k | (i.pattern.empty() || |
1338 | 91.3k | ((i.pattern[0] == '0' && r1->blen <= pos && |
1339 | 79.0k | strncmp(word.c_str() + pos - r1->blen, r1->word, r1->blen) == 0) || |
1340 | 79.0k | (i.pattern[0] != '0' && |
1341 | 78.4k | ((len = i.pattern.size()) != 0) && len <= pos && |
1342 | 78.4k | strncmp(word.c_str() + pos - len, i.pattern.c_str(), len) == 0)))) { |
1343 | 40.8k | return 1; |
1344 | 40.8k | } |
1345 | 464k | } |
1346 | 417k | return 0; |
1347 | 458k | } |
1348 | | |
1349 | | // forbid compounding with neighbouring upper and lower case characters at word |
1350 | | // bounds |
1351 | 636k | int AffixMgr::cpdcase_check(const std::string& word, int pos) { |
1352 | 636k | if (utf8) { |
1353 | 90.2k | const char* p; |
1354 | 90.2k | const char* wordp = word.c_str(); |
1355 | 94.9k | for (p = wordp + pos - 1; p > wordp && (*p & 0xc0) == 0x80; p--) |
1356 | 4.74k | ; |
1357 | 90.2k | std::string pair(p); |
1358 | 90.2k | std::vector<w_char> pair_u; |
1359 | 90.2k | u8_u16(pair_u, pair); |
1360 | 90.2k | unsigned short a = pair_u.size() > 1 ? (unsigned short)pair_u[1] : 0, |
1361 | 90.2k | b = !pair_u.empty() ? (unsigned short)pair_u[0] : 0; |
1362 | 90.2k | if (((unicodetoupper(a, langnum) == a) || |
1363 | 90.2k | (unicodetoupper(b, langnum) == b)) && |
1364 | 90.2k | (a != '-') && (b != '-')) |
1365 | 84.9k | return 1; |
1366 | 545k | } else { |
1367 | 545k | const unsigned char a = word[pos - 1], b = word[pos]; |
1368 | 545k | if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) |
1369 | 22.9k | return 1; |
1370 | 545k | } |
1371 | 528k | return 0; |
1372 | 636k | } |
1373 | | |
// Backtracking record for one metacharacter (*, ?) of a compound rule
struct metachar_data {
  // pattern position just behind the metacharacter
  short btpp;
  // position in the word array where the metacharacter started matching
  short btwp;
  // number of words currently matched by the metacharacter
  int btnum;
};
1379 | | |
1380 | | // check compound patterns |
1381 | | int AffixMgr::defcpd_check(hentry*** words, |
1382 | | short wnum, |
1383 | | hentry* rv, |
1384 | | hentry** def, |
1385 | 3.43M | char all) { |
1386 | 3.43M | int w = 0; |
1387 | | |
1388 | 3.43M | if (!*words) { |
1389 | 3.28M | w = 1; |
1390 | 3.28M | *words = def; |
1391 | 3.28M | } |
1392 | | |
1393 | 3.43M | if (!*words) { |
1394 | 0 | return 0; |
1395 | 0 | } |
1396 | | |
1397 | 3.43M | std::vector<metachar_data> btinfo(1); |
1398 | | |
1399 | 3.43M | short bt = 0; |
1400 | | |
1401 | 3.43M | (*words)[wnum] = rv; |
1402 | | |
1403 | | // has the last word COMPOUNDRULE flag? |
1404 | 3.43M | if (rv->alen == 0) { |
1405 | 2.43M | (*words)[wnum] = NULL; |
1406 | 2.43M | if (w) |
1407 | 2.39M | *words = NULL; |
1408 | 2.43M | return 0; |
1409 | 2.43M | } |
1410 | 1.00M | int ok = 0; |
1411 | 2.33M | for (auto& i : defcpdtable) { |
1412 | 8.96M | for (auto& j : i) { |
1413 | 8.96M | if (j != '*' && j != '?' && |
1414 | 8.96M | TESTAFF(rv->astr, j, rv->alen)) { |
1415 | 923k | ok = 1; |
1416 | 923k | break; |
1417 | 923k | } |
1418 | 8.96M | } |
1419 | 2.33M | } |
1420 | 1.00M | if (ok == 0) { |
1421 | 120k | (*words)[wnum] = NULL; |
1422 | 120k | if (w) |
1423 | 117k | *words = NULL; |
1424 | 120k | return 0; |
1425 | 120k | } |
1426 | | |
1427 | 1.44M | for (auto& i : defcpdtable) { |
1428 | 1.44M | size_t pp = 0; // pattern position |
1429 | 1.44M | signed short wp = 0; // "words" position |
1430 | 1.44M | int ok2 = 1; |
1431 | 1.44M | ok = 1; |
1432 | 1.80M | do { |
1433 | 2.69M | while ((pp < i.size()) && (wp <= wnum)) { |
1434 | 2.08M | if (((pp + 1) < i.size()) && |
1435 | 2.08M | ((i[pp + 1] == '*') || |
1436 | 1.90M | (i[pp + 1] == '?'))) { |
1437 | 546k | int wend = (i[pp + 1] == '?') ? wp : wnum; |
1438 | 546k | ok2 = 1; |
1439 | 546k | pp += 2; |
1440 | 546k | btinfo[bt].btpp = pp; |
1441 | 546k | btinfo[bt].btwp = wp; |
1442 | 916k | while (wp <= wend) { |
1443 | 592k | if (!(*words)[wp] || |
1444 | 592k | !(*words)[wp]->alen || |
1445 | 592k | !TESTAFF((*words)[wp]->astr, i[pp - 2], |
1446 | 592k | (*words)[wp]->alen)) { |
1447 | 221k | ok2 = 0; |
1448 | 221k | break; |
1449 | 221k | } |
1450 | 370k | wp++; |
1451 | 370k | } |
1452 | 546k | if (wp <= wnum) |
1453 | 221k | ok2 = 0; |
1454 | 546k | btinfo[bt].btnum = wp - btinfo[bt].btwp; |
1455 | 546k | if (btinfo[bt].btnum > 0) { |
1456 | 327k | ++bt; |
1457 | 327k | btinfo.resize(bt+1); |
1458 | 327k | } |
1459 | 546k | if (ok2) |
1460 | 324k | break; |
1461 | 1.54M | } else { |
1462 | 1.54M | ok2 = 1; |
1463 | 1.54M | if (!(*words)[wp] || !(*words)[wp]->alen || |
1464 | 1.54M | !TESTAFF((*words)[wp]->astr, i[pp], |
1465 | 1.54M | (*words)[wp]->alen)) { |
1466 | 873k | ok = 0; |
1467 | 873k | break; |
1468 | 873k | } |
1469 | 669k | pp++; |
1470 | 669k | wp++; |
1471 | 669k | if ((i.size() == pp) && !(wp > wnum)) |
1472 | 4.86k | ok = 0; |
1473 | 669k | } |
1474 | 2.08M | } |
1475 | 1.80M | if (ok && ok2) { |
1476 | 927k | size_t r = pp; |
1477 | 1.56M | while ((i.size() > r) && ((r + 1) < i.size()) && |
1478 | 1.56M | ((i[r + 1] == '*') || |
1479 | 1.25M | (i[r + 1] == '?'))) |
1480 | 635k | r += 2; |
1481 | 927k | if (i.size() <= r) |
1482 | 111k | return 1; |
1483 | 927k | } |
1484 | | // backtrack |
1485 | 1.69M | if (bt) |
1486 | 686k | do { |
1487 | 686k | ok = 1; |
1488 | 686k | btinfo[bt - 1].btnum--; |
1489 | 686k | pp = btinfo[bt - 1].btpp; |
1490 | 686k | wp = btinfo[bt - 1].btwp + (signed short)btinfo[bt - 1].btnum; |
1491 | 686k | } while ((btinfo[bt - 1].btnum < 0) && --bt); |
1492 | 1.69M | } while (bt); |
1493 | | |
1494 | 1.33M | if (ok && ok2 && (!all || (i.size() <= pp))) |
1495 | 694k | return 1; |
1496 | | |
1497 | | // check zero ending |
1498 | 637k | while (ok && ok2 && (i.size() > pp) && |
1499 | 637k | ((pp + 1) < i.size()) && |
1500 | 637k | ((i[pp + 1] == '*') || |
1501 | 2.60k | (i[pp + 1] == '?'))) |
1502 | 1.32k | pp += 2; |
1503 | 635k | if (ok && ok2 && (i.size() <= pp)) |
1504 | 0 | return 1; |
1505 | 635k | } |
1506 | 78.8k | (*words)[wnum] = NULL; |
1507 | 78.8k | if (w) |
1508 | 51.9k | *words = NULL; |
1509 | 78.8k | return 0; |
1510 | 883k | } |
1511 | | |
1512 | 17.2M | inline int AffixMgr::candidate_check(const std::string& word) { |
1513 | | |
1514 | 17.2M | struct hentry* rv = lookup(word.c_str(), word.size()); |
1515 | 17.2M | if (rv) |
1516 | 36.3k | return 1; |
1517 | | |
1518 | | // rv = prefix_check(word,0,len,1); |
1519 | | // if (rv) return 1; |
1520 | | |
1521 | 17.2M | rv = affix_check(word, 0, word.size()); |
1522 | 17.2M | if (rv) |
1523 | 58.8k | return 1; |
1524 | 17.1M | return 0; |
1525 | 17.2M | } |
1526 | | |
1527 | | // calculate number of syllable for compound-checking |
1528 | 15.7M | short AffixMgr::get_syllable(const std::string& word) { |
1529 | 15.7M | if (cpdmaxsyllable == 0) |
1530 | 14.8M | return 0; |
1531 | | |
1532 | 935k | short num = 0; |
1533 | | |
1534 | 935k | if (!utf8) { |
1535 | 932k | num = (short)std::count_if(word.begin(), word.end(), |
1536 | 7.92M | [&](char c) { |
1537 | 7.92M | return std::binary_search(cpdvowels.begin(), cpdvowels.end(), c); |
1538 | 7.92M | }); |
1539 | 932k | } else if (!cpdvowels_utf16.empty()) { |
1540 | 3.08k | std::vector<w_char> w; |
1541 | 3.08k | u8_u16(w, word); |
1542 | 3.08k | num = (short)std::count_if(w.begin(), w.end(), |
1543 | 23.0k | [&](w_char wc) { |
1544 | 23.0k | return std::binary_search(cpdvowels_utf16.begin(), cpdvowels_utf16.end(), wc); |
1545 | 23.0k | }); |
1546 | 3.08k | } |
1547 | | |
1548 | 935k | return num; |
1549 | 15.7M | } |
1550 | | |
1551 | 40.3M | void AffixMgr::setcminmax(size_t* cmin, size_t* cmax, const char* word, size_t len) { |
1552 | 40.3M | if (utf8) { |
1553 | 6.40M | int i; |
1554 | 46.8M | for (*cmin = 0, i = 0; (i < cpdmin) && *cmin < len; i++) { |
1555 | 47.3M | for ((*cmin)++; *cmin < len && (word[*cmin] & 0xc0) == 0x80; (*cmin)++) |
1556 | 6.88M | ; |
1557 | 40.4M | } |
1558 | 40.6M | for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax > 0; i++) { |
1559 | 41.1M | for ((*cmax)--; *cmax > 0 && (word[*cmax] & 0xc0) == 0x80; (*cmax)--) |
1560 | 6.87M | ; |
1561 | 34.2M | } |
1562 | 33.8M | } else { |
1563 | 33.8M | *cmin = cpdmin; |
1564 | 33.8M | *cmax = len - cpdmin + 1; |
1565 | 33.8M | } |
1566 | 40.3M | } |
1567 | | |
1568 | | // check if compound word is correctly spelled |
1569 | | // hu_mov_rule = spec. Hungarian rule (XXX) |
1570 | | struct hentry* AffixMgr::compound_check(const std::string& word, |
1571 | | short wordnum, |
1572 | | short numsyllable, |
1573 | | short maxwordnum, |
1574 | | short wnum, |
1575 | | hentry** words = NULL, |
1576 | | hentry** rwords = NULL, |
1577 | | char hu_mov_rule = 0, |
1578 | | char is_sug = 0, |
1579 | 30.7M | int* info = NULL) { |
1580 | 30.7M | short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; |
1581 | 30.7M | hentry *rv = NULL, *rv_first; |
1582 | 30.7M | std::string st; |
1583 | 30.7M | char ch = '\0', affixed; |
1584 | 30.7M | size_t cmin, cmax; |
1585 | 30.7M | int striple = 0, soldi = 0, oldcmin = 0, oldcmax = 0, oldlen = 0, checkedstriple = 0; |
1586 | 30.7M | hentry** oldwords = words; |
1587 | 30.7M | size_t scpd = 0, len = word.size(); |
1588 | | |
1589 | 30.7M | int checked_prefix; |
1590 | | |
1591 | | // add a time limit to handle possible |
1592 | | // combinatorical explosion of the overlapping words |
1593 | | |
1594 | 30.7M | HUNSPELL_THREAD_LOCAL std::chrono::steady_clock::time_point clock_time_start; |
1595 | 30.7M | HUNSPELL_THREAD_LOCAL bool timelimit_exceeded; |
1596 | | |
1597 | | // get the current time |
1598 | 30.7M | std::chrono::steady_clock::time_point clock_now = std::chrono::steady_clock::now(); |
1599 | | |
1600 | 30.7M | if (wordnum == 0) { |
1601 | | // set the start time |
1602 | 16.2M | clock_time_start = clock_now; |
1603 | 16.2M | timelimit_exceeded = false; |
1604 | 16.2M | } |
1605 | 14.5M | else if (std::chrono::duration_cast<std::chrono::milliseconds>(clock_now - clock_time_start).count() |
1606 | 14.5M | > static_cast<double>(TIMELIMIT) * CLOCKS_PER_SEC * 1000) |
1607 | 0 | timelimit_exceeded = true; |
1608 | | |
1609 | 30.7M | setcminmax(&cmin, &cmax, word.c_str(), len); |
1610 | | |
1611 | 30.7M | st.assign(word); |
1612 | | |
1613 | 1.17G | for (size_t i = cmin; i < cmax; ++i) { |
1614 | | // go to end of the UTF-8 character |
1615 | 1.14G | if (utf8) { |
1616 | 580M | for (; (st[i] & 0xc0) == 0x80; i++) |
1617 | 241M | ; |
1618 | 338M | if (i >= cmax) |
1619 | 546k | return NULL; |
1620 | 338M | } |
1621 | | |
1622 | 1.14G | words = oldwords; |
1623 | 1.14G | int onlycpdrule = (words) ? 1 : 0; |
1624 | | |
1625 | 1.23G | do { // onlycpdrule loop |
1626 | | |
1627 | 1.23G | oldnumsyllable = numsyllable; |
1628 | 1.23G | oldwordnum = wordnum; |
1629 | 1.23G | checked_prefix = 0; |
1630 | | |
1631 | 1.29G | do { // simplified checkcompoundpattern loop |
1632 | | |
1633 | 1.29G | if (timelimit_exceeded) |
1634 | 0 | return 0; |
1635 | | |
1636 | 1.29G | if (scpd > 0) { |
1637 | 128M | for (; scpd <= checkcpdtable.size() && |
1638 | 128M | (checkcpdtable[scpd - 1].pattern3.empty() || |
1639 | 81.4M | i > word.size() || |
1640 | 81.4M | word.compare(i, checkcpdtable[scpd - 1].pattern3.size(), checkcpdtable[scpd - 1].pattern3) != 0); |
1641 | 71.9M | scpd++) |
1642 | 71.9M | ; |
1643 | | |
1644 | 56.4M | if (scpd > checkcpdtable.size()) |
1645 | 46.9M | break; // break simplified checkcompoundpattern loop |
1646 | 9.50M | st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern); |
1647 | 9.50M | soldi = i; |
1648 | 9.50M | i += checkcpdtable[scpd - 1].pattern.size(); |
1649 | 9.50M | st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern2); |
1650 | 9.50M | st.replace(i + checkcpdtable[scpd - 1].pattern2.size(), std::string::npos, |
1651 | 9.50M | word.substr(soldi + checkcpdtable[scpd - 1].pattern3.size())); |
1652 | | |
1653 | 9.50M | oldlen = len; |
1654 | 9.50M | len += checkcpdtable[scpd - 1].pattern.size() + |
1655 | 9.50M | checkcpdtable[scpd - 1].pattern2.size() - |
1656 | 9.50M | checkcpdtable[scpd - 1].pattern3.size(); |
1657 | 9.50M | oldcmin = cmin; |
1658 | 9.50M | oldcmax = cmax; |
1659 | 9.50M | setcminmax(&cmin, &cmax, st.c_str(), len); |
1660 | | |
1661 | 9.50M | cmax = len - cpdmin + 1; |
1662 | 9.50M | } |
1663 | | |
1664 | 1.24G | if (i > st.size()) |
1665 | 227k | return NULL; |
1666 | | |
1667 | 1.24G | ch = st[i]; |
1668 | 1.24G | st[i] = '\0'; |
1669 | | |
1670 | 1.24G | sfx = NULL; |
1671 | 1.24G | pfx = NULL; |
1672 | | |
1673 | | // FIRST WORD |
1674 | | |
1675 | 1.24G | affixed = 1; |
1676 | 1.24G | rv = lookup(st.c_str(), i); // perhaps without prefix |
1677 | | |
1678 | | // forbid dictionary stems with COMPOUNDFORBIDFLAG in |
1679 | | // compound words, overriding the effect of COMPOUNDPERMITFLAG |
1680 | 1.24G | if ((rv) && compoundforbidflag && |
1681 | 1.24G | TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule) { |
1682 | 72.3k | bool would_continue = !onlycpdrule && simplifiedcpd; |
1683 | 72.3k | if (!scpd && would_continue) { |
1684 | | // given the while conditions that continue jumps to, this situation |
1685 | | // never ends |
1686 | 52.8k | HUNSPELL_WARNING(stderr, "break infinite loop\n"); |
1687 | 52.8k | break; |
1688 | 52.8k | } |
1689 | | |
1690 | 19.4k | if (scpd > 0 && would_continue) { |
1691 | | // under these conditions we loop again, but the assumption above |
1692 | | // appears to be that cmin and cmax are the original values they |
1693 | | // had in the outside loop |
1694 | 2.24k | cmin = oldcmin; |
1695 | 2.24k | cmax = oldcmax; |
1696 | 2.24k | } |
1697 | 19.4k | continue; |
1698 | 72.3k | } |
1699 | | |
1700 | | // search homonym with compound flag |
1701 | 1.32G | while ((rv) && !hu_mov_rule && |
1702 | 1.32G | ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || |
1703 | 89.5M | !((compoundflag && !words && !onlycpdrule && |
1704 | 89.5M | TESTAFF(rv->astr, compoundflag, rv->alen)) || |
1705 | 89.5M | (compoundbegin && !wordnum && !onlycpdrule && |
1706 | 77.0M | TESTAFF(rv->astr, compoundbegin, rv->alen)) || |
1707 | 89.5M | (compoundmiddle && wordnum && !words && !onlycpdrule && |
1708 | 76.6M | TESTAFF(rv->astr, compoundmiddle, rv->alen)) || |
1709 | 89.5M | (!defcpdtable.empty() && onlycpdrule && |
1710 | 76.4M | ((!words && !wordnum && |
1711 | 3.43M | defcpd_check(&words, wnum, rv, rwords, 0)) || |
1712 | 3.43M | (words && |
1713 | 2.71M | defcpd_check(&words, wnum, rv, rwords, 0))))) || |
1714 | 89.5M | (scpd != 0 && checkcpdtable[scpd - 1].cond != FLAG_NULL && |
1715 | 75.7M | !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)))) { |
1716 | 75.7M | rv = rv->next_homonym; |
1717 | 75.7M | } |
1718 | | |
1719 | 1.24G | if (rv) |
1720 | 13.8M | affixed = 0; |
1721 | | |
1722 | 1.24G | if (!rv) { |
1723 | 1.23G | if (onlycpdrule) |
1724 | 107M | break; |
1725 | 1.12G | if (compoundflag && |
1726 | 1.12G | !(rv = prefix_check(st, 0, i, |
1727 | 905M | hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, |
1728 | 905M | compoundflag))) { |
1729 | 898M | if (((rv = suffix_check( |
1730 | 898M | st, 0, i, 0, NULL, FLAG_NULL, compoundflag, |
1731 | 898M | hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || |
1732 | 898M | (compoundmoresuffixes && |
1733 | 898M | (rv = suffix_check_twosfx(st, 0, i, 0, NULL, compoundflag)))) && |
1734 | 898M | !hu_mov_rule && sfx->getCont() && |
1735 | 898M | ((compoundforbidflag && |
1736 | 27.3k | TESTAFF(sfx->getCont(), compoundforbidflag, |
1737 | 27.3k | sfx->getContLen())) || |
1738 | 27.3k | (compoundend && |
1739 | 25.7k | TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) { |
1740 | 2.03k | rv = NULL; |
1741 | 2.03k | } |
1742 | 898M | } |
1743 | | |
1744 | 1.12G | if (rv || |
1745 | 1.12G | (((wordnum == 0) && compoundbegin && |
1746 | 1.11G | ((rv = suffix_check( |
1747 | 149M | st, 0, i, 0, NULL, FLAG_NULL, compoundbegin, |
1748 | 149M | hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || |
1749 | 149M | (compoundmoresuffixes && |
1750 | 149M | (rv = suffix_check_twosfx( |
1751 | 12.8M | st, 0, i, 0, NULL, |
1752 | 12.8M | compoundbegin))) || // twofold suffixes + compound |
1753 | 149M | (rv = prefix_check(st, 0, i, |
1754 | 149M | hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, |
1755 | 149M | compoundbegin)))) || |
1756 | 1.11G | ((wordnum > 0) && compoundmiddle && |
1757 | 1.11G | ((rv = suffix_check( |
1758 | 9.43M | st, 0, i, 0, NULL, FLAG_NULL, compoundmiddle, |
1759 | 9.43M | hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || |
1760 | 9.43M | (compoundmoresuffixes && |
1761 | 9.42M | (rv = suffix_check_twosfx( |
1762 | 5.25M | st, 0, i, 0, NULL, |
1763 | 5.25M | compoundmiddle))) || // twofold suffixes + compound |
1764 | 9.43M | (rv = prefix_check(st, 0, i, |
1765 | 9.41M | hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, |
1766 | 9.41M | compoundmiddle)))))) |
1767 | 7.28M | checked_prefix = 1; |
1768 | | // else check forbiddenwords and needaffix |
1769 | 1.12G | } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || |
1770 | 13.8M | TESTAFF(rv->astr, needaffix, rv->alen) || |
1771 | 13.8M | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || |
1772 | 13.8M | (is_sug && nosuggest && |
1773 | 12.5M | TESTAFF(rv->astr, nosuggest, rv->alen)))) { |
1774 | 1.27M | st[i] = ch; |
1775 | | // continue; |
1776 | 1.27M | break; |
1777 | 1.27M | } |
1778 | | |
1779 | | // check non_compound flag in suffix and prefix |
1780 | 1.13G | if ((rv) && !hu_mov_rule && |
1781 | 1.13G | ((pfx && pfx->getCont() && |
1782 | 19.8M | TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) || |
1783 | 19.8M | (sfx && sfx->getCont() && |
1784 | 19.7M | TESTAFF(sfx->getCont(), compoundforbidflag, |
1785 | 19.7M | sfx->getContLen())))) { |
1786 | 29.5k | rv = NULL; |
1787 | 29.5k | } |
1788 | | |
1789 | | // check compoundend flag in suffix and prefix |
1790 | 1.13G | if ((rv) && !checked_prefix && compoundend && !hu_mov_rule && |
1791 | 1.13G | ((pfx && pfx->getCont() && |
1792 | 292k | TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) || |
1793 | 292k | (sfx && sfx->getCont() && |
1794 | 292k | TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) { |
1795 | 0 | rv = NULL; |
1796 | 0 | } |
1797 | | |
1798 | | // check compoundmiddle flag in suffix and prefix |
1799 | 1.13G | if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle && |
1800 | 1.13G | !hu_mov_rule && |
1801 | 1.13G | ((pfx && pfx->getCont() && |
1802 | 264k | TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) || |
1803 | 264k | (sfx && sfx->getCont() && |
1804 | 264k | TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) { |
1805 | 0 | rv = NULL; |
1806 | 0 | } |
1807 | | |
1808 | | // check forbiddenwords |
1809 | 1.13G | if ((rv) && (rv->astr) && |
1810 | 1.13G | (TESTAFF(rv->astr, forbiddenword, rv->alen) || |
1811 | 19.8M | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || |
1812 | 19.8M | (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) { |
1813 | 392k | return NULL; |
1814 | 392k | } |
1815 | | |
1816 | | // increment word number, if the second root has a compoundroot flag |
1817 | 1.13G | if ((rv) && compoundroot && |
1818 | 1.13G | (TESTAFF(rv->astr, compoundroot, rv->alen))) { |
1819 | 139k | wordnum++; |
1820 | 139k | } |
1821 | | |
1822 | | // first word is acceptable in compound words? |
1823 | 1.13G | if (((rv) && |
1824 | 1.13G | (checked_prefix || (words && words[wnum]) || |
1825 | 19.4M | (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || |
1826 | 19.4M | ((oldwordnum == 0) && compoundbegin && |
1827 | 401k | TESTAFF(rv->astr, compoundbegin, rv->alen)) || |
1828 | 19.4M | ((oldwordnum > 0) && compoundmiddle && |
1829 | 105k | TESTAFF(rv->astr, compoundmiddle, rv->alen)) |
1830 | | |
1831 | | // LANG_hu section: spec. Hungarian rule |
1832 | 19.4M | || ((langnum == LANG_hu) && hu_mov_rule && |
1833 | 10.9k | (TESTAFF( |
1834 | 10.6k | rv->astr, 'F', |
1835 | 10.6k | rv->alen) || // XXX hardwired Hungarian dictionary codes |
1836 | 10.6k | TESTAFF(rv->astr, 'G', rv->alen) || |
1837 | 10.6k | TESTAFF(rv->astr, 'H', rv->alen))) |
1838 | | // END of LANG_hu section |
1839 | 19.4M | ) && |
1840 | 1.13G | ( |
1841 | | // test CHECKCOMPOUNDPATTERN conditions |
1842 | 19.4M | scpd == 0 || checkcpdtable[scpd - 1].cond == FLAG_NULL || |
1843 | 19.4M | TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)) && |
1844 | 1.13G | !((checkcompoundtriple && scpd == 0 && |
1845 | 19.4M | !words && i < word.size() && // test triple letters |
1846 | 19.4M | (word[i - 1] == word[i]) && |
1847 | 19.4M | (((i > 1) && (word[i - 1] == word[i - 2])) || |
1848 | 81.1k | ((word[i - 1] == word[i + 1])) // may be word[i+1] == '\0' |
1849 | 81.1k | )) || |
1850 | 19.4M | (checkcompoundcase && scpd == 0 && !words && i < word.size() && |
1851 | 19.3M | cpdcase_check(word, i)))) |
1852 | | // LANG_hu section: spec. Hungarian rule |
1853 | 1.13G | || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && |
1854 | 1.11G | (rv = affix_check(st, 0, i)) && |
1855 | 1.11G | (sfx && sfx->getCont() && |
1856 | 9.45k | ( // XXX hardwired Hungarian dic. codes |
1857 | 4.18k | TESTAFF(sfx->getCont(), (unsigned short)'x', |
1858 | 4.18k | sfx->getContLen()) || |
1859 | 4.18k | TESTAFF( |
1860 | 4.18k | sfx->getCont(), (unsigned short)'%', |
1861 | 19.2M | sfx->getContLen()))))) { // first word is ok condition |
1862 | | |
1863 | | // LANG_hu section: spec. Hungarian rule |
1864 | 19.2M | if (langnum == LANG_hu) { |
1865 | | // calculate syllable number of the word |
1866 | 5.59M | numsyllable += get_syllable(st.substr(0, i)); |
1867 | | // + 1 word, if syllable number of the prefix > 1 (hungarian |
1868 | | // convention) |
1869 | 5.59M | if (pfx && (get_syllable(pfx->getKey()) > 1)) |
1870 | 15.4k | wordnum++; |
1871 | 5.59M | } |
1872 | | // END of LANG_hu section |
1873 | | |
1874 | | // NEXT WORD(S) |
1875 | 19.2M | rv_first = rv; |
1876 | 19.2M | st[i] = ch; |
1877 | | |
1878 | 19.4M | do { // striple loop |
1879 | | |
1880 | | // check simplifiedtriple |
1881 | 19.4M | if (simplifiedtriple) { |
1882 | 489k | if (striple) { |
1883 | 203k | checkedstriple = 1; |
1884 | 203k | i--; // check "fahrt" instead of "ahrt" in "Schiffahrt" |
1885 | 285k | } else if (i > 2 && i <= word.size() && word[i - 1] == word[i - 2]) |
1886 | 205k | striple = 1; |
1887 | 489k | } |
1888 | | |
1889 | 19.4M | rv = lookup(st.c_str() + i, st.size() - i); // perhaps without prefix |
1890 | | |
1891 | | // search homonym with compound flag |
1892 | 20.4M | while ((rv) && |
1893 | 20.4M | ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || |
1894 | 1.20M | !((compoundflag && !words && |
1895 | 1.20M | TESTAFF(rv->astr, compoundflag, rv->alen)) || |
1896 | 1.20M | (compoundend && !words && |
1897 | 906k | TESTAFF(rv->astr, compoundend, rv->alen)) || |
1898 | 1.20M | (!defcpdtable.empty() && words && |
1899 | 890k | defcpd_check(&words, wnum + 1, rv, NULL, 1))) || |
1900 | 1.20M | (scpd != 0 && checkcpdtable[scpd - 1].cond2 != FLAG_NULL && |
1901 | 317k | !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, |
1902 | 1.03M | rv->alen)))) { |
1903 | 1.03M | rv = rv->next_homonym; |
1904 | 1.03M | } |
1905 | | |
1906 | | // check FORCEUCASE |
1907 | 19.4M | if (rv && forceucase && |
1908 | 19.4M | (TESTAFF(rv->astr, forceucase, rv->alen)) && |
1909 | 19.4M | !(info && *info & SPELL_ORIGCAP)) |
1910 | 4.56k | rv = NULL; |
1911 | | |
1912 | 19.4M | if (rv && words && words[wnum + 1]) |
1913 | 538 | return rv_first; |
1914 | | |
1915 | 19.4M | oldnumsyllable2 = numsyllable; |
1916 | 19.4M | oldwordnum2 = wordnum; |
1917 | | |
1918 | | // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary |
1919 | | // code |
1920 | 19.4M | if ((rv) && (langnum == LANG_hu) && |
1921 | 19.4M | (TESTAFF(rv->astr, 'I', rv->alen)) && |
1922 | 19.4M | !(TESTAFF(rv->astr, 'J', rv->alen))) { |
1923 | 18.5k | numsyllable--; |
1924 | 18.5k | } |
1925 | | // END of LANG_hu section |
1926 | | |
1927 | | // increment word number, if the second root has a compoundroot flag |
1928 | 19.4M | if ((rv) && (compoundroot) && |
1929 | 19.4M | (TESTAFF(rv->astr, compoundroot, rv->alen))) { |
1930 | 6.20k | wordnum++; |
1931 | 6.20k | } |
1932 | | |
1933 | | // check forbiddenwords |
1934 | 19.4M | if ((rv) && (rv->astr) && |
1935 | 19.4M | (TESTAFF(rv->astr, forbiddenword, rv->alen) || |
1936 | 169k | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || |
1937 | 169k | (is_sug && nosuggest && |
1938 | 164k | TESTAFF(rv->astr, nosuggest, rv->alen)))) |
1939 | 5.09k | return NULL; |
1940 | | |
1941 | | // second word is acceptable, as a root? |
1942 | | // hungarian conventions: compounding is acceptable, |
1943 | | // when compound forms consist of 2 words, or if more, |
1944 | | // then the syllable number of root words must be 6, or lesser. |
1945 | | |
1946 | 19.4M | if ((rv) && |
1947 | 19.4M | ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || |
1948 | 164k | (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) && |
1949 | 19.4M | (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || |
1950 | 164k | ((cpdmaxsyllable != 0) && |
1951 | 24.7k | (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <= |
1952 | 17.6k | cpdmaxsyllable))) && |
1953 | 19.4M | ( |
1954 | | // test CHECKCOMPOUNDPATTERN |
1955 | 155k | checkcpdtable.empty() || scpd != 0 || |
1956 | 155k | (i < word.size() && !cpdpat_check(word, i, rv_first, rv, 0))) && |
1957 | 19.4M | ((!checkcompounddup || (rv != rv_first))) |
1958 | | // test CHECKCOMPOUNDPATTERN conditions |
1959 | 19.4M | && |
1960 | 19.4M | (scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL || |
1961 | 130k | TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) { |
1962 | | // forbid compound word, if it is a non-compound word with typical |
1963 | | // fault |
1964 | 130k | if ((checkcompoundrep && cpdrep_check(word, len)) || |
1965 | 130k | cpdwordpair_check(word, len)) |
1966 | 27.9k | return NULL; |
1967 | 102k | return rv_first; |
1968 | 130k | } |
1969 | | |
1970 | 19.3M | numsyllable = oldnumsyllable2; |
1971 | 19.3M | wordnum = oldwordnum2; |
1972 | | |
1973 | | // perhaps second word has prefix or/and suffix |
1974 | 19.3M | sfx = NULL; |
1975 | 19.3M | sfxflag = FLAG_NULL; |
1976 | 19.3M | rv = (compoundflag && !onlycpdrule && i < word.size()) |
1977 | 19.3M | ? affix_check(word, i, word.size() - i, compoundflag, |
1978 | 17.9M | IN_CPD_END) |
1979 | 19.3M | : NULL; |
1980 | 19.3M | if (!rv && compoundend && !onlycpdrule) { |
1981 | 658k | sfx = NULL; |
1982 | 658k | pfx = NULL; |
1983 | 658k | if (i < word.size()) |
1984 | 657k | rv = affix_check(word, i, word.size() - i, compoundend, IN_CPD_END); |
1985 | 658k | } |
1986 | | |
1987 | 19.3M | if (!rv && !defcpdtable.empty() && words) { |
1988 | 764k | if (i < word.size()) |
1989 | 764k | rv = affix_check(word, i, word.size() - i, 0, IN_CPD_END); |
1990 | 764k | if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) |
1991 | 43 | return rv_first; |
1992 | 764k | rv = NULL; |
1993 | 764k | } |
1994 | | |
1995 | | // test CHECKCOMPOUNDPATTERN conditions (allowed forms) |
1996 | 19.3M | if (rv && |
1997 | 19.3M | !(scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL || |
1998 | 75.6k | TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) |
1999 | 5.69k | rv = NULL; |
2000 | | |
2001 | | // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds) |
2002 | 19.3M | if (rv && !checkcpdtable.empty() && scpd == 0 && |
2003 | 19.3M | cpdpat_check(word, i, rv_first, rv, affixed)) |
2004 | 4.87k | rv = NULL; |
2005 | | |
2006 | | // check non_compound flag in suffix and prefix |
2007 | 19.3M | if ((rv) && ((pfx && pfx->getCont() && |
2008 | 65.0k | TESTAFF(pfx->getCont(), compoundforbidflag, |
2009 | 65.0k | pfx->getContLen())) || |
2010 | 65.0k | (sfx && sfx->getCont() && |
2011 | 63.9k | TESTAFF(sfx->getCont(), compoundforbidflag, |
2012 | 63.9k | sfx->getContLen())))) { |
2013 | 1.42k | rv = NULL; |
2014 | 1.42k | } |
2015 | | |
2016 | | // check FORCEUCASE |
2017 | 19.3M | if (rv && forceucase && |
2018 | 19.3M | (TESTAFF(rv->astr, forceucase, rv->alen)) && |
2019 | 19.3M | !(info && *info & SPELL_ORIGCAP)) |
2020 | 4.67k | rv = NULL; |
2021 | | |
2022 | | // check forbiddenwords |
2023 | 19.3M | if ((rv) && (rv->astr) && |
2024 | 19.3M | (TESTAFF(rv->astr, forbiddenword, rv->alen) || |
2025 | 58.3k | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || |
2026 | 58.3k | (is_sug && nosuggest && |
2027 | 51.0k | TESTAFF(rv->astr, nosuggest, rv->alen)))) |
2028 | 7.69k | return NULL; |
2029 | | |
2030 | | // pfxappnd = prefix of word+i, or NULL |
2031 | | // calculate syllable number of prefix. |
2032 | | // hungarian convention: when syllable number of prefix is more, |
2033 | | // than 1, the prefix+word counts as two words. |
2034 | | |
2035 | 19.3M | if (langnum == LANG_hu) { |
2036 | 5.53M | if (i < word.size()) { |
2037 | | // calculate syllable number of the word |
2038 | 5.37M | numsyllable += get_syllable(word.substr(i)); |
2039 | 5.37M | } |
2040 | | |
2041 | | // - affix syllable num. |
2042 | | // XXX only second suffix (inflections, not derivations) |
2043 | 5.53M | if (sfxappnd) { |
2044 | 1.10k | std::string tmp(sfxappnd); |
2045 | 1.10k | reverseword(tmp); |
2046 | 1.10k | numsyllable -= short(get_syllable(tmp) + sfxextra); |
2047 | 5.53M | } else { |
2048 | 5.53M | numsyllable -= short(sfxextra); |
2049 | 5.53M | } |
2050 | | |
2051 | | // + 1 word, if syllable number of the prefix > 1 (hungarian |
2052 | | // convention) |
2053 | 5.53M | if (pfx && (get_syllable(pfx->getKey()) > 1)) |
2054 | 722 | wordnum++; |
2055 | | |
2056 | | // increment syllable num, if last word has a SYLLABLENUM flag |
2057 | | // and the suffix is beginning `s' |
2058 | | |
2059 | 5.53M | if (!cpdsyllablenum.empty()) { |
2060 | 253 | switch (sfxflag) { |
2061 | 0 | case 'c': { |
2062 | 0 | numsyllable += 2; |
2063 | 0 | break; |
2064 | 0 | } |
2065 | 0 | case 'J': { |
2066 | 0 | numsyllable += 1; |
2067 | 0 | break; |
2068 | 0 | } |
2069 | 0 | case 'I': { |
2070 | 0 | if (rv && TESTAFF(rv->astr, 'J', rv->alen)) |
2071 | 0 | numsyllable += 1; |
2072 | 0 | break; |
2073 | 0 | } |
2074 | 253 | } |
2075 | 253 | } |
2076 | 5.53M | } |
2077 | | |
2078 | | // increment word number, if the second word has a compoundroot flag |
2079 | 19.3M | if ((rv) && (compoundroot) && |
2080 | 19.3M | (TESTAFF(rv->astr, compoundroot, rv->alen))) { |
2081 | 9.52k | wordnum++; |
2082 | 9.52k | } |
2083 | | // second word is acceptable, as a word with prefix or/and suffix? |
2084 | | // hungarian conventions: compounding is acceptable, |
2085 | | // when compound forms consist 2 word, otherwise |
2086 | | // the syllable number of root words is 6, or lesser. |
2087 | 19.3M | if ((rv) && |
2088 | 19.3M | (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || |
2089 | 51.3k | ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) && |
2090 | 19.3M | ((!checkcompounddup || (rv != rv_first)))) { |
2091 | | // forbid compound word, if it is a non-compound word with typical |
2092 | | // fault |
2093 | 29.4k | if ((checkcompoundrep && cpdrep_check(word, len)) || |
2094 | 29.4k | cpdwordpair_check(word, len)) |
2095 | 11.6k | return NULL; |
2096 | 17.7k | return rv_first; |
2097 | 29.4k | } |
2098 | | |
2099 | 19.2M | numsyllable = oldnumsyllable2; |
2100 | 19.2M | wordnum = oldwordnum2; |
2101 | | |
2102 | | // perhaps second word is a compound word (recursive call) |
2103 | | // (only if SPELL_COMPOUND_2 is not set and maxwordnum is not exceeded) |
2104 | 19.2M | if ((!info || !(*info & SPELL_COMPOUND_2)) && wordnum + 2 < maxwordnum) { |
2105 | 14.5M | rv = compound_check(st.substr(i), wordnum + 1, |
2106 | 14.5M | numsyllable, maxwordnum, wnum + 1, words, rwords, 0, |
2107 | 14.5M | is_sug, info); |
2108 | | |
2109 | 14.5M | if (rv && !checkcpdtable.empty() && i < word.size() && |
2110 | 14.5M | ((scpd == 0 && |
2111 | 397k | cpdpat_check(word, i, rv_first, rv, affixed)) || |
2112 | 397k | (scpd != 0 && |
2113 | 381k | !cpdpat_check(word, i, rv_first, rv, affixed)))) |
2114 | 25.5k | rv = NULL; |
2115 | 14.5M | } else { |
2116 | 4.76M | rv = NULL; |
2117 | 4.76M | } |
2118 | 19.2M | if (rv) { |
2119 | | // forbid compound word, if it is a non-compound word with typical |
2120 | | // fault, or a dictionary word pair |
2121 | | |
2122 | 553k | if (cpdwordpair_check(word, len)) |
2123 | 49.9k | return NULL; |
2124 | | |
2125 | 504k | if (checkcompoundrep || forbiddenword) { |
2126 | | |
2127 | 503k | if (checkcompoundrep && cpdrep_check(word, len)) |
2128 | 626 | return NULL; |
2129 | | |
2130 | | // check first part |
2131 | 503k | if (i < word.size() && word.compare(i, rv->blen, rv->word, rv->blen) == 0) { |
2132 | 97.9k | char r = st[i + rv->blen]; |
2133 | 97.9k | st[i + rv->blen] = '\0'; |
2134 | | |
2135 | 97.9k | if ((checkcompoundrep && cpdrep_check(st, i + rv->blen)) || |
2136 | 97.9k | cpdwordpair_check(st, i + rv->blen)) { |
2137 | 4.95k | st[ + i + rv->blen] = r; |
2138 | 4.95k | continue; |
2139 | 4.95k | } |
2140 | | |
2141 | 93.0k | if (forbiddenword) { |
2142 | 91.1k | struct hentry* rv2 = lookup(word.c_str(), word.size()); |
2143 | 91.1k | if (!rv2 && len <= word.size()) |
2144 | 80.5k | rv2 = affix_check(word, 0, len); |
2145 | 91.1k | if (rv2 && rv2->astr && |
2146 | 91.1k | TESTAFF(rv2->astr, forbiddenword, rv2->alen) && |
2147 | 91.1k | (strncmp(rv2->word, st.c_str(), i + rv->blen) == 0)) { |
2148 | 113 | return NULL; |
2149 | 113 | } |
2150 | 91.1k | } |
2151 | 92.9k | st[i + rv->blen] = r; |
2152 | 92.9k | } |
2153 | 503k | } |
2154 | 498k | return rv_first; |
2155 | 504k | } |
2156 | 19.2M | } while (striple && !checkedstriple); // end of striple loop |
2157 | | |
2158 | 18.5M | if (checkedstriple) { |
2159 | 202k | i++; |
2160 | 202k | checkedstriple = 0; |
2161 | 202k | striple = 0; |
2162 | 202k | } |
2163 | | |
2164 | 18.5M | } // first word is ok condition |
2165 | | |
2166 | 1.13G | if (soldi != 0) { |
2167 | 9.45M | i = soldi; |
2168 | 9.45M | soldi = 0; |
2169 | 9.45M | len = oldlen; |
2170 | 9.45M | cmin = oldcmin; |
2171 | 9.45M | cmax = oldcmax; |
2172 | 9.45M | } |
2173 | 1.13G | scpd++; |
2174 | | |
2175 | 1.13G | } while (!onlycpdrule && simplifiedcpd && |
2176 | 1.13G | scpd <= checkcpdtable.size()); // end of simplifiedcpd loop |
2177 | | |
2178 | 1.23G | scpd = 0; |
2179 | 1.23G | wordnum = oldwordnum; |
2180 | 1.23G | numsyllable = oldnumsyllable; |
2181 | | |
2182 | 1.23G | if (soldi != 0) { |
2183 | 9.78k | i = soldi; |
2184 | 9.78k | st.assign(word); // XXX add more optim. |
2185 | 9.78k | soldi = 0; |
2186 | 9.78k | len = oldlen; |
2187 | 9.78k | cmin = oldcmin; |
2188 | 9.78k | cmax = oldcmax; |
2189 | 9.78k | } else |
2190 | 1.23G | st[i] = ch; |
2191 | | |
2192 | 1.23G | } while (!defcpdtable.empty() && oldwordnum == 0 && |
2193 | 1.23G | onlycpdrule++ < 1); // end of onlycpd loop |
2194 | 1.14G | } |
2195 | | |
2196 | 28.9M | return NULL; |
2197 | 30.7M | } |
2198 | | |
2199 | | // check if a compound word is correctly spelled (morphological analysis variant) |
2200 | | // hu_mov_rule = special Hungarian rule (XXX) |
2201 | | int AffixMgr::compound_check_morph(const std::string& word, |
2202 | | short wordnum, |
2203 | | short numsyllable, |
2204 | | short maxwordnum, |
2205 | | short wnum, |
2206 | | hentry** words, |
2207 | | hentry** rwords, |
2208 | | char hu_mov_rule, |
2209 | | std::string& result, |
2210 | 0 | const std::string* partresult) { |
2211 | 0 | short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; |
2212 | 0 | hentry* rv = NULL, *rv_first; |
2213 | 0 | std::string st, presult; |
2214 | 0 | char ch, affixed = 0; |
2215 | 0 | int checked_prefix, ok = 0; |
2216 | 0 | size_t cmin, cmax; |
2217 | 0 | hentry** oldwords = words; |
2218 | 0 | size_t len = word.size(); |
2219 | | |
2220 | | // add a time limit to handle possible |
2221 | | // combinatorical explosion of the overlapping words |
2222 | |
|
2223 | 0 | HUNSPELL_THREAD_LOCAL std::chrono::steady_clock::time_point clock_time_start; |
2224 | 0 | HUNSPELL_THREAD_LOCAL bool timelimit_exceeded; |
2225 | | |
2226 | | // get the current time |
2227 | 0 | std::chrono::steady_clock::time_point clock_now = std::chrono::steady_clock::now(); |
2228 | |
|
2229 | 0 | if (wordnum == 0) { |
2230 | | // set the start time |
2231 | 0 | clock_time_start = clock_now; |
2232 | 0 | timelimit_exceeded = false; |
2233 | 0 | } |
2234 | 0 | else if (std::chrono::duration_cast<std::chrono::milliseconds>(clock_now - clock_time_start).count() |
2235 | 0 | > static_cast<double>(TIMELIMIT) * CLOCKS_PER_SEC * 1000) |
2236 | 0 | timelimit_exceeded = true; |
2237 | |
|
2238 | 0 | setcminmax(&cmin, &cmax, word.c_str(), len); |
2239 | |
|
2240 | 0 | st.assign(word); |
2241 | |
|
2242 | 0 | for (size_t i = cmin; i < cmax; ++i) { |
2243 | | // go to end of the UTF-8 character |
2244 | 0 | if (utf8) { |
2245 | 0 | for (; (st[i] & 0xc0) == 0x80; i++) |
2246 | 0 | ; |
2247 | 0 | if (i >= cmax) |
2248 | 0 | return 0; |
2249 | 0 | } |
2250 | | |
2251 | 0 | words = oldwords; |
2252 | 0 | int onlycpdrule = (words) ? 1 : 0; |
2253 | |
|
2254 | 0 | do { // onlycpdrule loop |
2255 | |
|
2256 | 0 | if (timelimit_exceeded) |
2257 | 0 | return 0; |
2258 | | |
2259 | 0 | oldnumsyllable = numsyllable; |
2260 | 0 | oldwordnum = wordnum; |
2261 | 0 | checked_prefix = 0; |
2262 | |
|
2263 | 0 | ch = st[i]; |
2264 | 0 | st[i] = '\0'; |
2265 | 0 | sfx = NULL; |
2266 | | |
2267 | | // FIRST WORD |
2268 | |
|
2269 | 0 | affixed = 1; |
2270 | |
|
2271 | 0 | presult.clear(); |
2272 | 0 | if (partresult) |
2273 | 0 | presult.append(*partresult); |
2274 | |
|
2275 | 0 | rv = lookup(st.c_str(), i); // perhaps without prefix |
2276 | | |
2277 | | // forbid dictionary stems with COMPOUNDFORBIDFLAG in |
2278 | | // compound words, overriding the effect of COMPOUNDPERMITFLAG |
2279 | 0 | if ((rv) && compoundforbidflag && |
2280 | 0 | TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule) |
2281 | 0 | continue; |
2282 | | |
2283 | | // search homonym with compound flag |
2284 | 0 | while ((rv) && !hu_mov_rule && |
2285 | 0 | ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || |
2286 | 0 | !((compoundflag && !words && !onlycpdrule && |
2287 | 0 | TESTAFF(rv->astr, compoundflag, rv->alen)) || |
2288 | 0 | (compoundbegin && !wordnum && !onlycpdrule && |
2289 | 0 | TESTAFF(rv->astr, compoundbegin, rv->alen)) || |
2290 | 0 | (compoundmiddle && wordnum && !words && !onlycpdrule && |
2291 | 0 | TESTAFF(rv->astr, compoundmiddle, rv->alen)) || |
2292 | 0 | (!defcpdtable.empty() && onlycpdrule && |
2293 | 0 | ((!words && !wordnum && |
2294 | 0 | defcpd_check(&words, wnum, rv, rwords, 0)) || |
2295 | 0 | (words && |
2296 | 0 | defcpd_check(&words, wnum, rv, rwords, 0))))))) { |
2297 | 0 | rv = rv->next_homonym; |
2298 | 0 | } |
2299 | | |
2300 | |
|
2301 | 0 | if (rv) |
2302 | 0 | affixed = 0; |
2303 | |
|
2304 | 0 | if (rv) { |
2305 | 0 | presult.push_back(MSEP_FLD); |
2306 | 0 | presult.append(MORPH_PART); |
2307 | 0 | presult.append(st, 0, i); |
2308 | 0 | if (!HENTRY_FIND(rv, MORPH_STEM)) { |
2309 | 0 | presult.push_back(MSEP_FLD); |
2310 | 0 | presult.append(MORPH_STEM); |
2311 | 0 | presult.append(st, 0, i); |
2312 | 0 | } |
2313 | 0 | if (HENTRY_DATA(rv)) { |
2314 | 0 | presult.push_back(MSEP_FLD); |
2315 | 0 | presult.append(HENTRY_DATA2(rv)); |
2316 | 0 | } |
2317 | 0 | } |
2318 | |
|
2319 | 0 | if (!rv) { |
2320 | 0 | if (compoundflag && |
2321 | 0 | !(rv = |
2322 | 0 | prefix_check(st, 0, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, |
2323 | 0 | compoundflag))) { |
2324 | 0 | if (((rv = suffix_check(st, 0, i, 0, NULL, FLAG_NULL, |
2325 | 0 | compoundflag, |
2326 | 0 | hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || |
2327 | 0 | (compoundmoresuffixes && |
2328 | 0 | (rv = suffix_check_twosfx(st, 0, i, 0, NULL, compoundflag)))) && |
2329 | 0 | !hu_mov_rule && sfx->getCont() && |
2330 | 0 | ((compoundforbidflag && |
2331 | 0 | TESTAFF(sfx->getCont(), compoundforbidflag, |
2332 | 0 | sfx->getContLen())) || |
2333 | 0 | (compoundend && |
2334 | 0 | TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) { |
2335 | 0 | rv = NULL; |
2336 | 0 | } |
2337 | 0 | } |
2338 | |
|
2339 | 0 | if (rv || |
2340 | 0 | (((wordnum == 0) && compoundbegin && |
2341 | 0 | ((rv = suffix_check(st, 0, i, 0, NULL, FLAG_NULL, |
2342 | 0 | compoundbegin, |
2343 | 0 | hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || |
2344 | 0 | (compoundmoresuffixes && |
2345 | 0 | (rv = suffix_check_twosfx( |
2346 | 0 | st, 0, i, 0, NULL, |
2347 | 0 | compoundbegin))) || // twofold suffix+compound |
2348 | 0 | (rv = prefix_check(st, 0, i, |
2349 | 0 | hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, |
2350 | 0 | compoundbegin)))) || |
2351 | 0 | ((wordnum > 0) && compoundmiddle && |
2352 | 0 | ((rv = suffix_check(st, 0, i, 0, NULL, FLAG_NULL, |
2353 | 0 | compoundmiddle, |
2354 | 0 | hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || |
2355 | 0 | (compoundmoresuffixes && |
2356 | 0 | (rv = suffix_check_twosfx( |
2357 | 0 | st, 0, i, 0, NULL, |
2358 | 0 | compoundmiddle))) || // twofold suffix+compound |
2359 | 0 | (rv = prefix_check(st, 0, i, |
2360 | 0 | hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, |
2361 | 0 | compoundmiddle)))))) { |
2362 | 0 | std::string p; |
2363 | 0 | if (compoundflag) |
2364 | 0 | p = affix_check_morph(st, 0, i, compoundflag); |
2365 | 0 | if (p.empty()) { |
2366 | 0 | if ((wordnum == 0) && compoundbegin) { |
2367 | 0 | p = affix_check_morph(st, 0, i, compoundbegin); |
2368 | 0 | } else if ((wordnum > 0) && compoundmiddle) { |
2369 | 0 | p = affix_check_morph(st, 0, i, compoundmiddle); |
2370 | 0 | } |
2371 | 0 | } |
2372 | 0 | if (!p.empty()) { |
2373 | 0 | presult.push_back(MSEP_FLD); |
2374 | 0 | presult.append(MORPH_PART); |
2375 | 0 | presult.append(st, 0, i); |
2376 | 0 | line_uniq_app(p, MSEP_REC); |
2377 | 0 | presult.append(p); |
2378 | 0 | } |
2379 | 0 | checked_prefix = 1; |
2380 | 0 | } |
2381 | | // else check forbiddenwords |
2382 | 0 | } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || |
2383 | 0 | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || |
2384 | 0 | TESTAFF(rv->astr, needaffix, rv->alen))) { |
2385 | 0 | st[i] = ch; |
2386 | 0 | continue; |
2387 | 0 | } |
2388 | | |
2389 | | // check non_compound flag in suffix and prefix |
2390 | 0 | if ((rv) && !hu_mov_rule && |
2391 | 0 | ((pfx && pfx->getCont() && |
2392 | 0 | TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) || |
2393 | 0 | (sfx && sfx->getCont() && |
2394 | 0 | TESTAFF(sfx->getCont(), compoundforbidflag, sfx->getContLen())))) { |
2395 | 0 | continue; |
2396 | 0 | } |
2397 | | |
2398 | | // check compoundend flag in suffix and prefix |
2399 | 0 | if ((rv) && !checked_prefix && compoundend && !hu_mov_rule && |
2400 | 0 | ((pfx && pfx->getCont() && |
2401 | 0 | TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) || |
2402 | 0 | (sfx && sfx->getCont() && |
2403 | 0 | TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) { |
2404 | 0 | continue; |
2405 | 0 | } |
2406 | | |
2407 | | // check compoundmiddle flag in suffix and prefix |
2408 | 0 | if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle && |
2409 | 0 | !hu_mov_rule && |
2410 | 0 | ((pfx && pfx->getCont() && |
2411 | 0 | TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) || |
2412 | 0 | (sfx && sfx->getCont() && |
2413 | 0 | TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) { |
2414 | 0 | rv = NULL; |
2415 | 0 | } |
2416 | | |
2417 | | // check forbiddenwords |
2418 | 0 | if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || |
2419 | 0 | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) |
2420 | 0 | continue; |
2421 | | |
2422 | | // increment word number, if the second root has a compoundroot flag |
2423 | 0 | if ((rv) && (compoundroot) && |
2424 | 0 | (TESTAFF(rv->astr, compoundroot, rv->alen))) { |
2425 | 0 | wordnum++; |
2426 | 0 | } |
2427 | | |
2428 | | // first word is acceptable in compound words? |
2429 | 0 | if (((rv) && |
2430 | 0 | (checked_prefix || (words && words[wnum]) || |
2431 | 0 | (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || |
2432 | 0 | ((oldwordnum == 0) && compoundbegin && |
2433 | 0 | TESTAFF(rv->astr, compoundbegin, rv->alen)) || |
2434 | 0 | ((oldwordnum > 0) && compoundmiddle && |
2435 | 0 | TESTAFF(rv->astr, compoundmiddle, rv->alen)) |
2436 | | // LANG_hu section: spec. Hungarian rule |
2437 | 0 | || ((langnum == LANG_hu) && // hu_mov_rule |
2438 | 0 | hu_mov_rule && (TESTAFF(rv->astr, 'F', rv->alen) || |
2439 | 0 | TESTAFF(rv->astr, 'G', rv->alen) || |
2440 | 0 | TESTAFF(rv->astr, 'H', rv->alen))) |
2441 | | // END of LANG_hu section |
2442 | 0 | ) && |
2443 | 0 | !((checkcompoundtriple && !words && // test triple letters |
2444 | 0 | (word[i - 1] == word[i]) && |
2445 | 0 | (((i > 1) && (word[i - 1] == word[i - 2])) || |
2446 | 0 | ((word[i - 1] == word[i + 1])) // may be word[i+1] == '\0' |
2447 | 0 | )) || |
2448 | 0 | ( |
2449 | | // test CHECKCOMPOUNDPATTERN |
2450 | 0 | !checkcpdtable.empty() && !words && |
2451 | 0 | cpdpat_check(word, i, rv, NULL, affixed)) || |
2452 | 0 | (checkcompoundcase && !words && cpdcase_check(word, i)))) |
2453 | | // LANG_hu section: spec. Hungarian rule |
2454 | 0 | || |
2455 | 0 | ((!rv) && (langnum == LANG_hu) && hu_mov_rule && |
2456 | 0 | (rv = affix_check(st, 0, i)) && |
2457 | 0 | (sfx && sfx->getCont() && |
2458 | 0 | (TESTAFF(sfx->getCont(), (unsigned short)'x', sfx->getContLen()) || |
2459 | 0 | TESTAFF(sfx->getCont(), (unsigned short)'%', sfx->getContLen())))) |
2460 | | // END of LANG_hu section |
2461 | 0 | ) { |
2462 | | // LANG_hu section: spec. Hungarian rule |
2463 | 0 | if (langnum == LANG_hu) { |
2464 | | // calculate syllable number of the word |
2465 | 0 | numsyllable += get_syllable(st.substr(0, i)); |
2466 | | |
2467 | | // + 1 word, if syllable number of the prefix > 1 (hungarian |
2468 | | // convention) |
2469 | 0 | if (pfx && (get_syllable(pfx->getKey()) > 1)) |
2470 | 0 | wordnum++; |
2471 | 0 | } |
2472 | | // END of LANG_hu section |
2473 | | |
2474 | | // NEXT WORD(S) |
2475 | 0 | rv_first = rv; |
2476 | 0 | rv = lookup(word.c_str() + i, word.size() - i); // perhaps without prefix |
2477 | | |
2478 | | // search homonym with compound flag |
2479 | 0 | while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || |
2480 | 0 | !((compoundflag && !words && |
2481 | 0 | TESTAFF(rv->astr, compoundflag, rv->alen)) || |
2482 | 0 | (compoundend && !words && |
2483 | 0 | TESTAFF(rv->astr, compoundend, rv->alen)) || |
2484 | 0 | (!defcpdtable.empty() && words && |
2485 | 0 | defcpd_check(&words, wnum + 1, rv, NULL, 1))))) { |
2486 | 0 | rv = rv->next_homonym; |
2487 | 0 | } |
2488 | |
|
2489 | 0 | if (rv && words && words[wnum + 1]) { |
2490 | 0 | result.append(presult); |
2491 | 0 | result.push_back(MSEP_FLD); |
2492 | 0 | result.append(MORPH_PART); |
2493 | 0 | result.append(word, i, word.size()); |
2494 | 0 | if (complexprefixes && HENTRY_DATA(rv)) |
2495 | 0 | result.append(HENTRY_DATA2(rv)); |
2496 | 0 | if (!HENTRY_FIND(rv, MORPH_STEM)) { |
2497 | 0 | result.push_back(MSEP_FLD); |
2498 | 0 | result.append(MORPH_STEM); |
2499 | 0 | result.append(HENTRY_WORD(rv)); |
2500 | 0 | } |
2501 | | // store the pointer of the hash entry |
2502 | 0 | if (!complexprefixes && HENTRY_DATA(rv)) { |
2503 | 0 | result.push_back(MSEP_FLD); |
2504 | 0 | result.append(HENTRY_DATA2(rv)); |
2505 | 0 | } |
2506 | 0 | result.push_back(MSEP_REC); |
2507 | 0 | return 0; |
2508 | 0 | } |
2509 | | |
2510 | 0 | oldnumsyllable2 = numsyllable; |
2511 | 0 | oldwordnum2 = wordnum; |
2512 | | |
2513 | | // LANG_hu section: spec. Hungarian rule |
2514 | 0 | if ((rv) && (langnum == LANG_hu) && |
2515 | 0 | (TESTAFF(rv->astr, 'I', rv->alen)) && |
2516 | 0 | !(TESTAFF(rv->astr, 'J', rv->alen))) { |
2517 | 0 | numsyllable--; |
2518 | 0 | } |
2519 | | // END of LANG_hu section |
2520 | | // increment word number, if the second root has a compoundroot flag |
2521 | 0 | if ((rv) && (compoundroot) && |
2522 | 0 | (TESTAFF(rv->astr, compoundroot, rv->alen))) { |
2523 | 0 | wordnum++; |
2524 | 0 | } |
2525 | | |
2526 | | // check forbiddenwords |
2527 | 0 | if ((rv) && (rv->astr) && |
2528 | 0 | (TESTAFF(rv->astr, forbiddenword, rv->alen) || |
2529 | 0 | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) { |
2530 | 0 | st[i] = ch; |
2531 | 0 | continue; |
2532 | 0 | } |
2533 | | |
2534 | | // second word is acceptable, as a root? |
2535 | | // hungarian conventions: compounding is acceptable, |
2536 | | // when compound forms consist of 2 words, or if more, |
2537 | | // then the syllable number of root words must be 6, or lesser. |
2538 | 0 | if ((rv) && |
2539 | 0 | ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || |
2540 | 0 | (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) && |
2541 | 0 | (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || |
2542 | 0 | ((cpdmaxsyllable != 0) && |
2543 | 0 | (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <= |
2544 | 0 | cpdmaxsyllable))) && |
2545 | 0 | ((!checkcompounddup || (rv != rv_first)))) { |
2546 | | // bad compound word |
2547 | 0 | result.append(presult); |
2548 | 0 | result.push_back(MSEP_FLD); |
2549 | 0 | result.append(MORPH_PART); |
2550 | 0 | result.append(word, i, word.size()); |
2551 | |
|
2552 | 0 | if (HENTRY_DATA(rv)) { |
2553 | 0 | if (complexprefixes) |
2554 | 0 | result.append(HENTRY_DATA2(rv)); |
2555 | 0 | if (!HENTRY_FIND(rv, MORPH_STEM)) { |
2556 | 0 | result.push_back(MSEP_FLD); |
2557 | 0 | result.append(MORPH_STEM); |
2558 | 0 | result.append(HENTRY_WORD(rv)); |
2559 | 0 | } |
2560 | | // store the pointer of the hash entry |
2561 | 0 | if (!complexprefixes) { |
2562 | 0 | result.push_back(MSEP_FLD); |
2563 | 0 | result.append(HENTRY_DATA2(rv)); |
2564 | 0 | } |
2565 | 0 | } |
2566 | 0 | result.push_back(MSEP_REC); |
2567 | 0 | ok = 1; |
2568 | 0 | } |
2569 | |
|
2570 | 0 | numsyllable = oldnumsyllable2; |
2571 | 0 | wordnum = oldwordnum2; |
2572 | | |
2573 | | // perhaps second word has prefix or/and suffix |
2574 | 0 | sfx = NULL; |
2575 | 0 | sfxflag = FLAG_NULL; |
2576 | |
|
2577 | 0 | if (compoundflag && !onlycpdrule) |
2578 | 0 | rv = affix_check(word, i, word.size() - i, compoundflag); |
2579 | 0 | else |
2580 | 0 | rv = NULL; |
2581 | |
|
2582 | 0 | if (!rv && compoundend && !onlycpdrule) { |
2583 | 0 | sfx = NULL; |
2584 | 0 | pfx = NULL; |
2585 | 0 | rv = affix_check(word, i, word.size() - i, compoundend); |
2586 | 0 | } |
2587 | |
|
2588 | 0 | if (!rv && !defcpdtable.empty() && words) { |
2589 | 0 | rv = affix_check(word, i, word.size() - i, 0, IN_CPD_END); |
2590 | 0 | if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) { |
2591 | 0 | std::string m; |
2592 | 0 | if (compoundflag) |
2593 | 0 | m = affix_check_morph(word, i, word.size() - i, compoundflag); |
2594 | 0 | if (m.empty() && compoundend) { |
2595 | 0 | m = affix_check_morph(word, i, word.size() - i, compoundend); |
2596 | 0 | } |
2597 | 0 | result.append(presult); |
2598 | 0 | if (!m.empty()) { |
2599 | 0 | result.push_back(MSEP_FLD); |
2600 | 0 | result.append(MORPH_PART); |
2601 | 0 | result.append(word, i, word.size()); |
2602 | 0 | line_uniq_app(m, MSEP_REC); |
2603 | 0 | result.append(m); |
2604 | 0 | } |
2605 | 0 | result.push_back(MSEP_REC); |
2606 | 0 | ok = 1; |
2607 | 0 | } |
2608 | 0 | } |
2609 | | |
2610 | | // check non_compound flag in suffix and prefix |
2611 | 0 | if ((rv) && |
2612 | 0 | ((pfx && pfx->getCont() && |
2613 | 0 | TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) || |
2614 | 0 | (sfx && sfx->getCont() && |
2615 | 0 | TESTAFF(sfx->getCont(), compoundforbidflag, |
2616 | 0 | sfx->getContLen())))) { |
2617 | 0 | rv = NULL; |
2618 | 0 | } |
2619 | | |
2620 | | // check forbiddenwords |
2621 | 0 | if ((rv) && (rv->astr) && |
2622 | 0 | (TESTAFF(rv->astr, forbiddenword, rv->alen) || |
2623 | 0 | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)) && |
2624 | 0 | (!TESTAFF(rv->astr, needaffix, rv->alen))) { |
2625 | 0 | st[i] = ch; |
2626 | 0 | continue; |
2627 | 0 | } |
2628 | | |
2629 | 0 | if (langnum == LANG_hu) { |
2630 | | // calculate syllable number of the word |
2631 | 0 | numsyllable += get_syllable(word.c_str() + i); |
2632 | | |
2633 | | // - affix syllable num. |
2634 | | // XXX only second suffix (inflections, not derivations) |
2635 | 0 | if (sfxappnd) { |
2636 | 0 | std::string tmp(sfxappnd); |
2637 | 0 | reverseword(tmp); |
2638 | 0 | numsyllable -= short(get_syllable(tmp) + sfxextra); |
2639 | 0 | } else { |
2640 | 0 | numsyllable -= short(sfxextra); |
2641 | 0 | } |
2642 | | |
2643 | | // + 1 word, if syllable number of the prefix > 1 (hungarian |
2644 | | // convention) |
2645 | 0 | if (pfx && (get_syllable(pfx->getKey()) > 1)) |
2646 | 0 | wordnum++; |
2647 | | |
2648 | | // increment syllable num, if last word has a SYLLABLENUM flag |
2649 | | // and the suffix is beginning `s' |
2650 | |
|
2651 | 0 | if (!cpdsyllablenum.empty()) { |
2652 | 0 | switch (sfxflag) { |
2653 | 0 | case 'c': { |
2654 | 0 | numsyllable += 2; |
2655 | 0 | break; |
2656 | 0 | } |
2657 | 0 | case 'J': { |
2658 | 0 | numsyllable += 1; |
2659 | 0 | break; |
2660 | 0 | } |
2661 | 0 | case 'I': { |
2662 | 0 | if (rv && TESTAFF(rv->astr, 'J', rv->alen)) |
2663 | 0 | numsyllable += 1; |
2664 | 0 | break; |
2665 | 0 | } |
2666 | 0 | } |
2667 | 0 | } |
2668 | 0 | } |
2669 | | |
2670 | | // increment word number, if the second word has a compoundroot flag |
2671 | 0 | if ((rv) && (compoundroot) && |
2672 | 0 | (TESTAFF(rv->astr, compoundroot, rv->alen))) { |
2673 | 0 | wordnum++; |
2674 | 0 | } |
2675 | | // second word is acceptable, as a word with prefix or/and suffix? |
2676 | | // hungarian conventions: compounding is acceptable, |
2677 | | // when compound forms consist 2 word, otherwise |
2678 | | // the syllable number of root words is 6, or lesser. |
2679 | 0 | if ((rv) && |
2680 | 0 | (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || |
2681 | 0 | ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) && |
2682 | 0 | ((!checkcompounddup || (rv != rv_first)))) { |
2683 | 0 | std::string m; |
2684 | 0 | if (compoundflag) |
2685 | 0 | m = affix_check_morph(word, i, word.size() - i, compoundflag); |
2686 | 0 | if (m.empty() && compoundend) { |
2687 | 0 | m = affix_check_morph(word, i, word.size() - i, compoundend); |
2688 | 0 | } |
2689 | 0 | result.append(presult); |
2690 | 0 | if (!m.empty()) { |
2691 | 0 | result.push_back(MSEP_FLD); |
2692 | 0 | result.append(MORPH_PART); |
2693 | 0 | result.append(word, i, word.size()); |
2694 | 0 | line_uniq_app(m, MSEP_REC); |
2695 | 0 | result.push_back(MSEP_FLD); |
2696 | 0 | result.append(m); |
2697 | 0 | } |
2698 | 0 | result.push_back(MSEP_REC); |
2699 | 0 | ok = 1; |
2700 | 0 | } |
2701 | |
|
2702 | 0 | numsyllable = oldnumsyllable2; |
2703 | 0 | wordnum = oldwordnum2; |
2704 | | |
2705 | | // perhaps second word is a compound word (recursive call) |
2706 | 0 | if ((wordnum + 2 < maxwordnum) && (ok == 0)) { |
2707 | 0 | compound_check_morph(word.substr(i), wordnum + 1, |
2708 | 0 | numsyllable, maxwordnum, wnum + 1, words, rwords, 0, |
2709 | 0 | result, &presult); |
2710 | 0 | } else { |
2711 | 0 | rv = NULL; |
2712 | 0 | } |
2713 | 0 | } |
2714 | 0 | st[i] = ch; |
2715 | 0 | wordnum = oldwordnum; |
2716 | 0 | numsyllable = oldnumsyllable; |
2717 | |
|
2718 | 0 | } while (!defcpdtable.empty() && oldwordnum == 0 && |
2719 | 0 | onlycpdrule++ < 1); // end of onlycpd loop |
2720 | 0 | } |
2721 | 0 | return 0; |
2722 | 0 | } |
2723 | | |
2724 | | |
2725 | | inline int AffixMgr::isRevSubset(const char* s1, |
2726 | | const char* end_of_s2, |
2727 | 96.2M | int len) { |
2728 | 120M | while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) { |
2729 | 24.2M | s1++; |
2730 | 24.2M | end_of_s2--; |
2731 | 24.2M | len--; |
2732 | 24.2M | } |
2733 | 96.2M | return (*s1 == '\0'); |
2734 | 96.2M | } |
2735 | | |
2736 | | // check word for suffixes |
2737 | | struct hentry* AffixMgr::suffix_check(const std::string& word, |
2738 | | int start, |
2739 | | int len, |
2740 | | int sfxopts, |
2741 | | PfxEntry* ppfx, |
2742 | | const FLAG cclass, |
2743 | | const FLAG needflag, |
2744 | 1.41G | char in_compound) { |
2745 | 1.41G | struct hentry* rv = NULL; |
2746 | 1.41G | PfxEntry* ep = ppfx; |
2747 | | |
2748 | | // first handle the special case of 0 length suffixes |
2749 | 1.41G | SfxEntry* se = sStart[0]; |
2750 | | |
2751 | 2.22G | while (se) { |
2752 | 806M | if (!cclass || se->getCont()) { |
2753 | | // suffixes are not allowed in beginning of compounds |
2754 | 765M | if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass |
2755 | | // except when signed with compoundpermitflag flag |
2756 | 765M | (se->getCont() && compoundpermitflag && |
2757 | 383M | TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) && |
2758 | 765M | (!circumfix || |
2759 | | // no circumfix flag in prefix and suffix |
2760 | 387M | ((!ppfx || !(ep->getCont()) || |
2761 | 21.8M | !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && |
2762 | 21.8M | (!se->getCont() || |
2763 | 21.7M | !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) || |
2764 | | // circumfix flag in prefix AND suffix |
2765 | 387M | ((ppfx && (ep->getCont()) && |
2766 | 307k | TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && |
2767 | 307k | (se->getCont() && |
2768 | 48.8k | (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) && |
2769 | | // fogemorpheme |
2770 | 765M | (in_compound || |
2771 | 387M | !(se->getCont() && |
2772 | 372M | (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) && |
2773 | | // needaffix on prefix or first suffix |
2774 | 765M | (cclass || |
2775 | 347M | !(se->getCont() && |
2776 | 281M | TESTAFF(se->getCont(), needaffix, se->getContLen())) || |
2777 | 347M | (ppfx && |
2778 | 30.1M | !((ep->getCont()) && |
2779 | 317M | TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))) { |
2780 | 317M | rv = se->checkword(word, start, len, sfxopts, ppfx, |
2781 | 317M | (FLAG)cclass, needflag, |
2782 | 317M | (in_compound ? 0 : onlyincompound)); |
2783 | 317M | if (rv) { |
2784 | 322k | sfx = se; // BUG: sfx not stateless |
2785 | 322k | return rv; |
2786 | 322k | } |
2787 | 317M | } |
2788 | 765M | } |
2789 | 806M | se = se->getNext(); |
2790 | 806M | } |
2791 | | |
2792 | | // now handle the general case |
2793 | 1.41G | if (len == 0) |
2794 | 46.5k | return NULL; // FULLSTRIP |
2795 | 1.41G | unsigned char sp = word[start + len - 1]; |
2796 | 1.41G | SfxEntry* sptr = sStart[sp]; |
2797 | | |
2798 | 1.49G | while (sptr) { |
2799 | 83.8M | if (isRevSubset(sptr->getKey(), word.c_str() + start + len - 1, len)) { |
2800 | | // suffixes are not allowed in beginning of compounds |
2801 | 82.4M | if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass |
2802 | | // except when signed with compoundpermitflag flag |
2803 | 82.4M | (sptr->getCont() && compoundpermitflag && |
2804 | 43.7M | TESTAFF(sptr->getCont(), compoundpermitflag, |
2805 | 43.7M | sptr->getContLen()))) && |
2806 | 82.4M | (!circumfix || |
2807 | | // no circumfix flag in prefix and suffix |
2808 | 40.1M | ((!ppfx || !(ep->getCont()) || |
2809 | 3.92M | !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && |
2810 | 3.92M | (!sptr->getCont() || |
2811 | 3.91M | !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) || |
2812 | | // circumfix flag in prefix AND suffix |
2813 | 40.1M | ((ppfx && (ep->getCont()) && |
2814 | 91.2k | TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && |
2815 | 91.2k | (sptr->getCont() && |
2816 | 11.3k | (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) && |
2817 | | // fogemorpheme |
2818 | 82.4M | (in_compound || |
2819 | 40.0M | !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, |
2820 | 15.6M | sptr->getContLen()))))) && |
2821 | | // needaffix on prefix or first suffix |
2822 | 82.4M | (cclass || |
2823 | 39.7M | !(sptr->getCont() && |
2824 | 28.8M | TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || |
2825 | 39.7M | (ppfx && |
2826 | 3.36M | !((ep->getCont()) && |
2827 | 151k | TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))) |
2828 | 36.5M | if (in_compound != IN_CPD_END || ppfx || |
2829 | 36.5M | !(sptr->getCont() && |
2830 | 36.5M | TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) { |
2831 | 36.5M | rv = sptr->checkword(word, start, len, sfxopts, ppfx, |
2832 | 36.5M | cclass, needflag, |
2833 | 36.5M | (in_compound ? 0 : onlyincompound)); |
2834 | 36.5M | if (rv) { |
2835 | 39.4k | sfx = sptr; // BUG: sfx not stateless |
2836 | 39.4k | sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless |
2837 | 39.4k | if (!sptr->getCont()) |
2838 | 12.8k | sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless |
2839 | | // LANG_hu section: spec. Hungarian rule |
2840 | 26.6k | else if (langnum == LANG_hu && sptr->getKeyLen() && |
2841 | 26.6k | sptr->getKey()[0] == 'i' && sptr->getKey()[1] != 'y' && |
2842 | 26.6k | sptr->getKey()[1] != 't') { |
2843 | 0 | sfxextra = 1; |
2844 | 0 | } |
2845 | | // END of LANG_hu section |
2846 | 39.4k | return rv; |
2847 | 39.4k | } |
2848 | 36.5M | } |
2849 | 82.4M | sptr = sptr->getNextEQ(); |
2850 | 82.4M | } else { |
2851 | 1.38M | sptr = sptr->getNextNE(); |
2852 | 1.38M | } |
2853 | 83.8M | } |
2854 | | |
2855 | 1.41G | return NULL; |
2856 | 1.41G | } |
2857 | | |
2858 | | // check word for two-level suffixes |
2859 | | struct hentry* AffixMgr::suffix_check_twosfx(const std::string& word, |
2860 | | int start, |
2861 | | int len, |
2862 | | int sfxopts, |
2863 | | PfxEntry* ppfx, |
2864 | 87.9M | const FLAG needflag) { |
2865 | 87.9M | struct hentry* rv = NULL; |
2866 | | |
2867 | | // first handle the special case of 0 length suffixes |
2868 | 87.9M | SfxEntry* se = sStart[0]; |
2869 | 188M | while (se) { |
2870 | 100M | if (contclasses[se->getFlag()]) { |
2871 | 82.1M | rv = se->check_twosfx(word, start, len, sfxopts, ppfx, needflag); |
2872 | 82.1M | if (rv) |
2873 | 245k | return rv; |
2874 | 82.1M | } |
2875 | 100M | se = se->getNext(); |
2876 | 100M | } |
2877 | | |
2878 | | // now handle the general case |
2879 | 87.6M | if (len == 0) |
2880 | 12.9k | return NULL; // FULLSTRIP |
2881 | 87.6M | unsigned char sp = word[start + len - 1]; |
2882 | 87.6M | SfxEntry* sptr = sStart[sp]; |
2883 | | |
2884 | 99.9M | while (sptr) { |
2885 | 12.3M | if (isRevSubset(sptr->getKey(), word.c_str() + start + len - 1, len)) { |
2886 | 12.1M | if (contclasses[sptr->getFlag()]) { |
2887 | 9.47M | rv = sptr->check_twosfx(word, start, len, sfxopts, ppfx, needflag); |
2888 | 9.47M | if (rv) { |
2889 | 9.12k | sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless |
2890 | 9.12k | if (!sptr->getCont()) |
2891 | 3.89k | sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless |
2892 | 9.12k | return rv; |
2893 | 9.12k | } |
2894 | 9.47M | } |
2895 | 12.1M | sptr = sptr->getNextEQ(); |
2896 | 12.1M | } else { |
2897 | 173k | sptr = sptr->getNextNE(); |
2898 | 173k | } |
2899 | 12.3M | } |
2900 | | |
2901 | 87.6M | return NULL; |
2902 | 87.6M | } |
2903 | | |
2904 | | // check word for two-level suffixes and morph |
2905 | | std::string AffixMgr::suffix_check_twosfx_morph(const std::string& word, |
2906 | | int start, |
2907 | | int len, |
2908 | | int sfxopts, |
2909 | | PfxEntry* ppfx, |
2910 | 0 | const FLAG needflag) { |
2911 | 0 | std::string result; |
2912 | 0 | std::string result2; |
2913 | 0 | std::string result3; |
2914 | | |
2915 | | // first handle the special case of 0 length suffixes |
2916 | 0 | SfxEntry* se = sStart[0]; |
2917 | 0 | while (se) { |
2918 | 0 | if (contclasses[se->getFlag()]) { |
2919 | 0 | std::string st = se->check_twosfx_morph(word, start, len, sfxopts, ppfx, needflag); |
2920 | 0 | if (!st.empty()) { |
2921 | 0 | if (ppfx) { |
2922 | 0 | if (ppfx->getMorph()) { |
2923 | 0 | result.append(ppfx->getMorph()); |
2924 | 0 | result.push_back(MSEP_FLD); |
2925 | 0 | } else |
2926 | 0 | debugflag(result, ppfx->getFlag()); |
2927 | 0 | } |
2928 | 0 | result.append(st); |
2929 | 0 | if (se->getMorph()) { |
2930 | 0 | result.push_back(MSEP_FLD); |
2931 | 0 | result.append(se->getMorph()); |
2932 | 0 | } else |
2933 | 0 | debugflag(result, se->getFlag()); |
2934 | 0 | result.push_back(MSEP_REC); |
2935 | 0 | } |
2936 | 0 | } |
2937 | 0 | se = se->getNext(); |
2938 | 0 | } |
2939 | | |
2940 | | // now handle the general case |
2941 | 0 | if (len == 0) |
2942 | 0 | return { }; // FULLSTRIP |
2943 | 0 | unsigned char sp = word[start + len - 1]; |
2944 | 0 | SfxEntry* sptr = sStart[sp]; |
2945 | |
|
2946 | 0 | while (sptr) { |
2947 | 0 | if (isRevSubset(sptr->getKey(), word.c_str() + start + len - 1, len)) { |
2948 | 0 | if (contclasses[sptr->getFlag()]) { |
2949 | 0 | std::string st = sptr->check_twosfx_morph(word, start, len, sfxopts, ppfx, needflag); |
2950 | 0 | if (!st.empty()) { |
2951 | 0 | sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless |
2952 | 0 | if (!sptr->getCont()) |
2953 | 0 | sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless |
2954 | 0 | result2.assign(st); |
2955 | |
|
2956 | 0 | result3.clear(); |
2957 | |
|
2958 | 0 | if (sptr->getMorph()) { |
2959 | 0 | result3.push_back(MSEP_FLD); |
2960 | 0 | result3.append(sptr->getMorph()); |
2961 | 0 | } else |
2962 | 0 | debugflag(result3, sptr->getFlag()); |
2963 | 0 | strlinecat(result2, result3); |
2964 | 0 | result2.push_back(MSEP_REC); |
2965 | 0 | result.append(result2); |
2966 | 0 | } |
2967 | 0 | } |
2968 | 0 | sptr = sptr->getNextEQ(); |
2969 | 0 | } else { |
2970 | 0 | sptr = sptr->getNextNE(); |
2971 | 0 | } |
2972 | 0 | } |
2973 | |
|
2974 | 0 | return result; |
2975 | 0 | } |
2976 | | |
2977 | | std::string AffixMgr::suffix_check_morph(const std::string& word, |
2978 | | int start, |
2979 | | int len, |
2980 | | int sfxopts, |
2981 | | PfxEntry* ppfx, |
2982 | | const FLAG cclass, |
2983 | | const FLAG needflag, |
2984 | 0 | char in_compound) { |
2985 | 0 | std::string result; |
2986 | |
|
2987 | 0 | struct hentry* rv = NULL; |
2988 | |
|
2989 | 0 | PfxEntry* ep = ppfx; |
2990 | | |
2991 | | // first handle the special case of 0 length suffixes |
2992 | 0 | SfxEntry* se = sStart[0]; |
2993 | 0 | while (se) { |
2994 | 0 | if (!cclass || se->getCont()) { |
2995 | | // suffixes are not allowed in beginning of compounds |
2996 | 0 | if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass |
2997 | | // except when signed with compoundpermitflag flag |
2998 | 0 | (se->getCont() && compoundpermitflag && |
2999 | 0 | TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) && |
3000 | 0 | (!circumfix || |
3001 | | // no circumfix flag in prefix and suffix |
3002 | 0 | ((!ppfx || !(ep->getCont()) || |
3003 | 0 | !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && |
3004 | 0 | (!se->getCont() || |
3005 | 0 | !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) || |
3006 | | // circumfix flag in prefix AND suffix |
3007 | 0 | ((ppfx && (ep->getCont()) && |
3008 | 0 | TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && |
3009 | 0 | (se->getCont() && |
3010 | 0 | (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) && |
3011 | | // fogemorpheme |
3012 | 0 | (in_compound || |
3013 | 0 | !((se->getCont() && |
3014 | 0 | (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) && |
3015 | | // needaffix on prefix or first suffix |
3016 | 0 | (cclass || |
3017 | 0 | !(se->getCont() && |
3018 | 0 | TESTAFF(se->getCont(), needaffix, se->getContLen())) || |
3019 | 0 | (ppfx && |
3020 | 0 | !((ep->getCont()) && |
3021 | 0 | TESTAFF(ep->getCont(), needaffix, ep->getContLen())))))) |
3022 | 0 | rv = se->checkword(word, start, len, sfxopts, ppfx, cclass, |
3023 | 0 | needflag, FLAG_NULL); |
3024 | 0 | while (rv) { |
3025 | 0 | if (ppfx) { |
3026 | 0 | if (ppfx->getMorph()) { |
3027 | 0 | result.append(ppfx->getMorph()); |
3028 | 0 | result.push_back(MSEP_FLD); |
3029 | 0 | } else |
3030 | 0 | debugflag(result, ppfx->getFlag()); |
3031 | 0 | } |
3032 | 0 | if (complexprefixes && HENTRY_DATA(rv)) |
3033 | 0 | result.append(HENTRY_DATA2(rv)); |
3034 | 0 | if (!HENTRY_FIND(rv, MORPH_STEM)) { |
3035 | 0 | result.push_back(MSEP_FLD); |
3036 | 0 | result.append(MORPH_STEM); |
3037 | 0 | result.append(HENTRY_WORD(rv)); |
3038 | 0 | } |
3039 | |
|
3040 | 0 | if (!complexprefixes && HENTRY_DATA(rv)) { |
3041 | 0 | result.push_back(MSEP_FLD); |
3042 | 0 | result.append(HENTRY_DATA2(rv)); |
3043 | 0 | } |
3044 | 0 | if (se->getMorph()) { |
3045 | 0 | result.push_back(MSEP_FLD); |
3046 | 0 | result.append(se->getMorph()); |
3047 | 0 | } else |
3048 | 0 | debugflag(result, se->getFlag()); |
3049 | 0 | result.push_back(MSEP_REC); |
3050 | 0 | rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); |
3051 | 0 | } |
3052 | 0 | } |
3053 | 0 | se = se->getNext(); |
3054 | 0 | } |
3055 | | |
3056 | | // now handle the general case |
3057 | 0 | if (len == 0) |
3058 | 0 | return { }; // FULLSTRIP |
3059 | 0 | unsigned char sp = word[start + len - 1]; |
3060 | 0 | SfxEntry* sptr = sStart[sp]; |
3061 | |
|
3062 | 0 | while (sptr) { |
3063 | 0 | if (isRevSubset(sptr->getKey(), word.c_str() + start + len - 1, len)) { |
3064 | | // suffixes are not allowed in beginning of compounds |
3065 | 0 | if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass |
3066 | | // except when signed with compoundpermitflag flag |
3067 | 0 | (sptr->getCont() && compoundpermitflag && |
3068 | 0 | TESTAFF(sptr->getCont(), compoundpermitflag, |
3069 | 0 | sptr->getContLen()))) && |
3070 | 0 | (!circumfix || |
3071 | | // no circumfix flag in prefix and suffix |
3072 | 0 | ((!ppfx || !(ep->getCont()) || |
3073 | 0 | !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && |
3074 | 0 | (!sptr->getCont() || |
3075 | 0 | !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) || |
3076 | | // circumfix flag in prefix AND suffix |
3077 | 0 | ((ppfx && (ep->getCont()) && |
3078 | 0 | TESTAFF(ep->getCont(), circumfix, ep->getContLen())) && |
3079 | 0 | (sptr->getCont() && |
3080 | 0 | (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) && |
3081 | | // fogemorpheme |
3082 | 0 | (in_compound || |
3083 | 0 | !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, |
3084 | 0 | sptr->getContLen()))))) && |
3085 | | // needaffix on first suffix |
3086 | 0 | (cclass || |
3087 | 0 | !(sptr->getCont() && |
3088 | 0 | TESTAFF(sptr->getCont(), needaffix, sptr->getContLen()))))) |
3089 | 0 | rv = sptr->checkword(word, start, len, sfxopts, ppfx, cclass, |
3090 | 0 | needflag, FLAG_NULL); |
3091 | 0 | while (rv) { |
3092 | 0 | if (ppfx) { |
3093 | 0 | if (ppfx->getMorph()) { |
3094 | 0 | result.append(ppfx->getMorph()); |
3095 | 0 | result.push_back(MSEP_FLD); |
3096 | 0 | } else |
3097 | 0 | debugflag(result, ppfx->getFlag()); |
3098 | 0 | } |
3099 | 0 | if (complexprefixes && HENTRY_DATA(rv)) |
3100 | 0 | result.append(HENTRY_DATA2(rv)); |
3101 | 0 | if (!HENTRY_FIND(rv, MORPH_STEM)) { |
3102 | 0 | result.push_back(MSEP_FLD); |
3103 | 0 | result.append(MORPH_STEM); |
3104 | 0 | result.append(HENTRY_WORD(rv)); |
3105 | 0 | } |
3106 | |
|
3107 | 0 | if (!complexprefixes && HENTRY_DATA(rv)) { |
3108 | 0 | result.push_back(MSEP_FLD); |
3109 | 0 | result.append(HENTRY_DATA2(rv)); |
3110 | 0 | } |
3111 | |
|
3112 | 0 | if (sptr->getMorph()) { |
3113 | 0 | result.push_back(MSEP_FLD); |
3114 | 0 | result.append(sptr->getMorph()); |
3115 | 0 | } else |
3116 | 0 | debugflag(result, sptr->getFlag()); |
3117 | 0 | result.push_back(MSEP_REC); |
3118 | 0 | rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); |
3119 | 0 | } |
3120 | 0 | sptr = sptr->getNextEQ(); |
3121 | 0 | } else { |
3122 | 0 | sptr = sptr->getNextNE(); |
3123 | 0 | } |
3124 | 0 | } |
3125 | |
|
3126 | 0 | return result; |
3127 | 0 | } |
3128 | | |
3129 | | // check if word with affixes is correctly spelled |
3130 | | struct hentry* AffixMgr::affix_check(const std::string& word, |
3131 | | int start, |
3132 | | int len, |
3133 | | const FLAG needflag, |
3134 | 41.9M | char in_compound) { |
3135 | | |
3136 | | // check all prefixes (also crossed with suffixes if allowed) |
3137 | 41.9M | struct hentry* rv = prefix_check(word, start, len, in_compound, needflag); |
3138 | 41.9M | if (rv) |
3139 | 161k | return rv; |
3140 | | |
3141 | | // if still not found check all suffixes |
3142 | 41.7M | rv = suffix_check(word, start, len, 0, NULL, FLAG_NULL, needflag, in_compound); |
3143 | | |
3144 | 41.7M | if (havecontclass) { |
3145 | 15.0M | sfx = NULL; |
3146 | 15.0M | pfx = NULL; |
3147 | | |
3148 | 15.0M | if (rv) |
3149 | 38.9k | return rv; |
3150 | | // if still not found check all two-level suffixes |
3151 | 15.0M | rv = suffix_check_twosfx(word, start, len, 0, NULL, needflag); |
3152 | | |
3153 | 15.0M | if (rv) |
3154 | 10.3k | return rv; |
3155 | | // if still not found check all two-level suffixes |
3156 | 15.0M | rv = prefix_check_twosfx(word, start, len, IN_CPD_NOT, needflag); |
3157 | 15.0M | } |
3158 | | |
3159 | 41.7M | return rv; |
3160 | 41.7M | } |
3161 | | |
3162 | | // check if word with affixes is correctly spelled |
3163 | | std::string AffixMgr::affix_check_morph(const std::string& word, |
3164 | | int start, |
3165 | | int len, |
3166 | | const FLAG needflag, |
3167 | 0 | char in_compound) { |
3168 | 0 | std::string result; |
3169 | | |
3170 | | // check all prefixes (also crossed with suffixes if allowed) |
3171 | 0 | std::string st = prefix_check_morph(word, start, len, in_compound); |
3172 | 0 | if (!st.empty()) { |
3173 | 0 | result.append(st); |
3174 | 0 | } |
3175 | | |
3176 | | // if still not found check all suffixes |
3177 | 0 | st = suffix_check_morph(word, start, len, 0, NULL, '\0', needflag, in_compound); |
3178 | 0 | if (!st.empty()) { |
3179 | 0 | result.append(st); |
3180 | 0 | } |
3181 | |
|
3182 | 0 | if (havecontclass) { |
3183 | 0 | sfx = NULL; |
3184 | 0 | pfx = NULL; |
3185 | | // if still not found check all two-level suffixes |
3186 | 0 | st = suffix_check_twosfx_morph(word, start, len, 0, NULL, needflag); |
3187 | 0 | if (!st.empty()) { |
3188 | 0 | result.append(st); |
3189 | 0 | } |
3190 | | |
3191 | | // if still not found check all two-level suffixes |
3192 | 0 | st = prefix_check_twosfx_morph(word, start, len, IN_CPD_NOT, needflag); |
3193 | 0 | if (!st.empty()) { |
3194 | 0 | result.append(st); |
3195 | 0 | } |
3196 | 0 | } |
3197 | |
|
3198 | 0 | return result; |
3199 | 0 | } |
3200 | | |
3201 | | // morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields |
3202 | | // in the first line of the inputs |
3203 | | // return 0, if inputs equal |
3204 | | // return 1, if inputs may equal with a secondary suffix |
3205 | | // otherwise return -1 |
3206 | 0 | static int morphcmp(const char* s, const char* t) { |
3207 | 0 | int se = 0, te = 0; |
3208 | 0 | const char* sl; |
3209 | 0 | const char* tl; |
3210 | 0 | const char* olds; |
3211 | 0 | const char* oldt; |
3212 | 0 | if (!s || !t) |
3213 | 0 | return 1; |
3214 | 0 | olds = s; |
3215 | 0 | sl = strchr(s, '\n'); |
3216 | 0 | s = strstr(s, MORPH_DERI_SFX); |
3217 | 0 | if (!s || (sl && sl < s)) |
3218 | 0 | s = strstr(olds, MORPH_INFL_SFX); |
3219 | 0 | if (!s || (sl && sl < s)) { |
3220 | 0 | s = strstr(olds, MORPH_TERM_SFX); |
3221 | 0 | olds = NULL; |
3222 | 0 | } |
3223 | 0 | oldt = t; |
3224 | 0 | tl = strchr(t, '\n'); |
3225 | 0 | t = strstr(t, MORPH_DERI_SFX); |
3226 | 0 | if (!t || (tl && tl < t)) |
3227 | 0 | t = strstr(oldt, MORPH_INFL_SFX); |
3228 | 0 | if (!t || (tl && tl < t)) { |
3229 | 0 | t = strstr(oldt, MORPH_TERM_SFX); |
3230 | 0 | oldt = NULL; |
3231 | 0 | } |
3232 | 0 | while (s && t && (!sl || sl > s) && (!tl || tl > t)) { |
3233 | 0 | s += MORPH_TAG_LEN; |
3234 | 0 | t += MORPH_TAG_LEN; |
3235 | 0 | se = 0; |
3236 | 0 | te = 0; |
3237 | 0 | while ((*s == *t) && !se && !te) { |
3238 | 0 | s++; |
3239 | 0 | t++; |
3240 | 0 | switch (*s) { |
3241 | 0 | case ' ': |
3242 | 0 | case '\n': |
3243 | 0 | case '\t': |
3244 | 0 | case '\0': |
3245 | 0 | se = 1; |
3246 | 0 | } |
3247 | 0 | switch (*t) { |
3248 | 0 | case ' ': |
3249 | 0 | case '\n': |
3250 | 0 | case '\t': |
3251 | 0 | case '\0': |
3252 | 0 | te = 1; |
3253 | 0 | } |
3254 | 0 | } |
3255 | 0 | if (!se || !te) { |
3256 | | // not terminal suffix difference |
3257 | 0 | if (olds) |
3258 | 0 | return -1; |
3259 | 0 | return 1; |
3260 | 0 | } |
3261 | 0 | olds = s; |
3262 | 0 | s = strstr(s, MORPH_DERI_SFX); |
3263 | 0 | if (!s || (sl && sl < s)) |
3264 | 0 | s = strstr(olds, MORPH_INFL_SFX); |
3265 | 0 | if (!s || (sl && sl < s)) { |
3266 | 0 | s = strstr(olds, MORPH_TERM_SFX); |
3267 | 0 | olds = NULL; |
3268 | 0 | } |
3269 | 0 | oldt = t; |
3270 | 0 | t = strstr(t, MORPH_DERI_SFX); |
3271 | 0 | if (!t || (tl && tl < t)) |
3272 | 0 | t = strstr(oldt, MORPH_INFL_SFX); |
3273 | 0 | if (!t || (tl && tl < t)) { |
3274 | 0 | t = strstr(oldt, MORPH_TERM_SFX); |
3275 | 0 | oldt = NULL; |
3276 | 0 | } |
3277 | 0 | } |
3278 | 0 | if (!s && !t && se && te) |
3279 | 0 | return 0; |
3280 | 0 | return 1; |
3281 | 0 | } |
3282 | | |
3283 | | std::string AffixMgr::morphgen(const char* ts, |
3284 | | int wl, |
3285 | | const unsigned short* ap, |
3286 | | unsigned short al, |
3287 | | const char* morph, |
3288 | | const char* targetmorph, |
3289 | 0 | int level) { |
3290 | | // handle suffixes |
3291 | 0 | if (!morph) |
3292 | 0 | return {}; |
3293 | | |
3294 | | // check substandard flag |
3295 | 0 | if (TESTAFF(ap, substandard, al)) |
3296 | 0 | return {}; |
3297 | | |
3298 | 0 | if (morphcmp(morph, targetmorph) == 0) |
3299 | 0 | return ts; |
3300 | | |
3301 | 0 | size_t stemmorphcatpos; |
3302 | 0 | std::string mymorph; |
3303 | | |
3304 | | // use input suffix fields, if exist |
3305 | 0 | if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) { |
3306 | 0 | mymorph.assign(morph); |
3307 | 0 | mymorph.push_back(MSEP_FLD); |
3308 | 0 | stemmorphcatpos = mymorph.size(); |
3309 | 0 | } else { |
3310 | 0 | stemmorphcatpos = std::string::npos; |
3311 | 0 | } |
3312 | |
|
3313 | 0 | for (int i = 0; i < al; i++) { |
3314 | 0 | const auto c = (unsigned char)(ap[i] & 0x00FF); |
3315 | 0 | SfxEntry* sptr = sFlag[c]; |
3316 | 0 | while (sptr) { |
3317 | 0 | if (sptr->getFlag() == ap[i] && sptr->getMorph() && |
3318 | 0 | ((sptr->getContLen() == 0) || |
3319 | | // don't generate forms with substandard affixes |
3320 | 0 | !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) { |
3321 | 0 | const char* stemmorph; |
3322 | 0 | if (stemmorphcatpos != std::string::npos) { |
3323 | 0 | mymorph.replace(stemmorphcatpos, std::string::npos, sptr->getMorph()); |
3324 | 0 | stemmorph = mymorph.c_str(); |
3325 | 0 | } else { |
3326 | 0 | stemmorph = sptr->getMorph(); |
3327 | 0 | } |
3328 | |
|
3329 | 0 | int cmp = morphcmp(stemmorph, targetmorph); |
3330 | |
|
3331 | 0 | if (cmp == 0) { |
3332 | 0 | std::string newword = sptr->add(ts, wl); |
3333 | 0 | if (!newword.empty()) { |
3334 | 0 | hentry* check = pHMgr->lookup(newword.c_str(), newword.size()); // XXX extra dic |
3335 | 0 | if (!check || !check->astr || |
3336 | 0 | !(TESTAFF(check->astr, forbiddenword, check->alen) || |
3337 | 0 | TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) { |
3338 | 0 | return newword; |
3339 | 0 | } |
3340 | 0 | } |
3341 | 0 | } |
3342 | | |
3343 | | // recursive call for secondary suffixes |
3344 | 0 | if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) && |
3345 | 0 | !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) { |
3346 | 0 | std::string newword = sptr->add(ts, wl); |
3347 | 0 | if (!newword.empty()) { |
3348 | 0 | std::string newword2 = |
3349 | 0 | morphgen(newword.c_str(), newword.size(), sptr->getCont(), |
3350 | 0 | sptr->getContLen(), stemmorph, targetmorph, 1); |
3351 | |
|
3352 | 0 | if (!newword2.empty()) { |
3353 | 0 | return newword2; |
3354 | 0 | } |
3355 | 0 | } |
3356 | 0 | } |
3357 | 0 | } |
3358 | 0 | sptr = sptr->getFlgNxt(); |
3359 | 0 | } |
3360 | 0 | } |
3361 | 0 | return { }; |
3362 | 0 | } |
3363 | | |
namespace {
// strdup replacement: returns a new[]-allocated copy of the NUL-terminated
// string `s`, or NULL when `s` is NULL.
char* mystrdup(const char* s) {
  if (!s)
    return NULL;
  const size_t bytes = strlen(s) + 1;  // terminator included
  char* copy = new char[bytes];
  memcpy(copy, s, bytes);
  return copy;
}
}
3376 | | |
3377 | | int AffixMgr::expand_rootword(struct guessword* wlst, |
3378 | | int maxn, |
3379 | | const char* ts, |
3380 | | int wl, |
3381 | | const unsigned short* ap, |
3382 | | unsigned short al, |
3383 | | const char* bad, |
3384 | | int badl, |
3385 | 1.06M | const char* phon) { |
3386 | 1.06M | int nh = 0; |
3387 | | // first add root word to list |
3388 | 1.06M | if ((nh < maxn) && |
3389 | 1.06M | !(al && ((needaffix && TESTAFF(ap, needaffix, al)) || |
3390 | 1.06M | (onlyincompound && TESTAFF(ap, onlyincompound, al))))) { |
3391 | 1.06M | wlst[nh].word = mystrdup(ts); |
3392 | 1.06M | wlst[nh].allow = false; |
3393 | 1.06M | wlst[nh].orig = NULL; |
3394 | 1.06M | nh++; |
3395 | | // add special phonetic version |
3396 | 1.06M | if (phon && (nh < maxn)) { |
3397 | 78.9k | wlst[nh].word = mystrdup(phon); |
3398 | 78.9k | wlst[nh].allow = false; |
3399 | 78.9k | wlst[nh].orig = mystrdup(ts); |
3400 | 78.9k | nh++; |
3401 | 78.9k | } |
3402 | 1.06M | } |
3403 | | |
3404 | | // handle suffixes |
3405 | 1.97M | for (int i = 0; i < al; i++) { |
3406 | 910k | const auto c = (unsigned char)(ap[i] & 0x00FF); |
3407 | 910k | SfxEntry* sptr = sFlag[c]; |
3408 | 1.05M | while (sptr) { |
3409 | 140k | if ((sptr->getFlag() == ap[i]) && |
3410 | 140k | (!sptr->getKeyLen() || |
3411 | 135k | ((badl > sptr->getKeyLen()) && |
3412 | 64.9k | (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) && |
3413 | | // check needaffix flag |
3414 | 140k | !(sptr->getCont() && |
3415 | 72.0k | ((needaffix && |
3416 | 61.2k | TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || |
3417 | 61.2k | (circumfix && |
3418 | 60.8k | TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) || |
3419 | 61.2k | (onlyincompound && |
3420 | 70.2k | TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) { |
3421 | 70.2k | std::string newword = sptr->add(ts, wl); |
3422 | 70.2k | if (!newword.empty()) { |
3423 | 27.1k | if (nh < maxn) { |
3424 | 26.7k | wlst[nh].word = mystrdup(newword.c_str()); |
3425 | 26.7k | wlst[nh].allow = sptr->allowCross(); |
3426 | 26.7k | wlst[nh].orig = NULL; |
3427 | 26.7k | nh++; |
3428 | | // add special phonetic version |
3429 | 26.7k | if (phon && (nh < maxn)) { |
3430 | 4.07k | std::string prefix(phon); |
3431 | 4.07k | std::string key(sptr->getKey()); |
3432 | 4.07k | reverseword(key); |
3433 | 4.07k | prefix.append(key); |
3434 | 4.07k | wlst[nh].word = mystrdup(prefix.c_str()); |
3435 | 4.07k | wlst[nh].allow = false; |
3436 | 4.07k | wlst[nh].orig = mystrdup(newword.c_str()); |
3437 | 4.07k | nh++; |
3438 | 4.07k | } |
3439 | 26.7k | } |
3440 | 27.1k | } |
3441 | 70.2k | } |
3442 | 140k | sptr = sptr->getFlgNxt(); |
3443 | 140k | } |
3444 | 910k | } |
3445 | | |
3446 | 1.06M | int n = nh; |
3447 | | |
3448 | | // handle cross products of prefixes and suffixes |
3449 | 1.17M | for (int j = 1; j < n; j++) |
3450 | 109k | if (wlst[j].allow) { |
3451 | 271k | for (int k = 0; k < al; k++) { |
3452 | 259k | const auto c = (unsigned char)(ap[k] & 0x00FF); |
3453 | 259k | PfxEntry* cptr = pFlag[c]; |
3454 | 428k | while (cptr) { |
3455 | 169k | if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && |
3456 | 169k | (!cptr->getKeyLen() || |
3457 | 64.4k | ((badl > cptr->getKeyLen()) && |
3458 | 57.5k | (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) { |
3459 | 57.5k | int l1 = strlen(wlst[j].word); |
3460 | 57.5k | std::string newword = cptr->add(wlst[j].word, l1); |
3461 | 57.5k | if (!newword.empty()) { |
3462 | 27.4k | if (nh < maxn) { |
3463 | 8.13k | wlst[nh].word = mystrdup(newword.c_str()); |
3464 | 8.13k | wlst[nh].allow = cptr->allowCross(); |
3465 | 8.13k | wlst[nh].orig = NULL; |
3466 | 8.13k | nh++; |
3467 | 8.13k | } |
3468 | 27.4k | } |
3469 | 57.5k | } |
3470 | 169k | cptr = cptr->getFlgNxt(); |
3471 | 169k | } |
3472 | 259k | } |
3473 | 12.0k | } |
3474 | | |
3475 | | // now handle pure prefixes |
3476 | 1.97M | for (int m = 0; m < al; m++) { |
3477 | 910k | const auto c = (unsigned char)(ap[m] & 0x00FF); |
3478 | 910k | PfxEntry* ptr = pFlag[c]; |
3479 | 1.08M | while (ptr) { |
3480 | 172k | if ((ptr->getFlag() == ap[m]) && |
3481 | 172k | (!ptr->getKeyLen() || |
3482 | 167k | ((badl > ptr->getKeyLen()) && |
3483 | 98.1k | (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) && |
3484 | | // check needaffix flag |
3485 | 172k | !(ptr->getCont() && |
3486 | 70.1k | ((needaffix && |
3487 | 64.0k | TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) || |
3488 | 64.0k | (circumfix && |
3489 | 63.3k | TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) || |
3490 | 64.0k | (onlyincompound && |
3491 | 67.0k | TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))) { |
3492 | 67.0k | std::string newword = ptr->add(ts, wl); |
3493 | 67.0k | if (!newword.empty()) { |
3494 | 27.5k | if (nh < maxn) { |
3495 | 26.6k | wlst[nh].word = mystrdup(newword.c_str()); |
3496 | 26.6k | wlst[nh].allow = ptr->allowCross(); |
3497 | 26.6k | wlst[nh].orig = NULL; |
3498 | 26.6k | nh++; |
3499 | 26.6k | } |
3500 | 27.5k | } |
3501 | 67.0k | } |
3502 | 172k | ptr = ptr->getFlgNxt(); |
3503 | 172k | } |
3504 | 910k | } |
3505 | | |
3506 | 1.06M | return nh; |
3507 | 1.06M | } |
3508 | | |
// return the replacement table, as provided by the hash manager (pHMgr)
const std::vector<replentry>& AffixMgr::get_reptable() const {
  return pHMgr->get_reptable();
}
3513 | | |
3514 | | // return iconv table |
3515 | 3.09M | RepList* AffixMgr::get_iconvtable() const { |
3516 | 3.09M | if (!iconvtable) |
3517 | 2.76M | return NULL; |
3518 | 326k | return iconvtable; |
3519 | 3.09M | } |
3520 | | |
3521 | | // return oconv table |
3522 | 108k | RepList* AffixMgr::get_oconvtable() const { |
3523 | 108k | if (!oconvtable) |
3524 | 106k | return NULL; |
3525 | 2.15k | return oconvtable; |
3526 | 108k | } |
3527 | | |
3528 | | // return replacing table |
3529 | 105k | struct phonetable* AffixMgr::get_phonetable() const { |
3530 | 105k | if (!phone) |
3531 | 73.2k | return NULL; |
3532 | 31.8k | return phone; |
3533 | 105k | } |
3534 | | |
// return the character map table (maptable)
const std::vector<mapentry>& AffixMgr::get_maptable() const {
  return maptable;
}
3539 | | |
// return the break table (breaktable)
const std::vector<std::string>& AffixMgr::get_breaktable() const {
  return breaktable;
}
3544 | | |
3545 | | // return text encoding of dictionary |
3546 | 46.2k | const std::string& AffixMgr::get_encoding() { |
3547 | 46.2k | if (encoding.empty()) |
3548 | 13.0k | encoding = SPELL_ENCODING; |
3549 | 46.2k | return encoding; |
3550 | 46.2k | } |
3551 | | |
// return the language number (langnum)
int AffixMgr::get_langnum() const {
  return langnum;
}
3556 | | |
// return double prefix option (complexprefixes)
int AffixMgr::get_complexprefixes() const {
  return complexprefixes;
}
3561 | | |
// return FULLSTRIP option (fullstrip)
int AffixMgr::get_fullstrip() const {
  return fullstrip;
}
3566 | | |
// return the keepcase flag
FLAG AffixMgr::get_keepcase() const {
  return keepcase;
}
3570 | | |
// return the forceucase flag
FLAG AffixMgr::get_forceucase() const {
  return forceucase;
}
3574 | | |
// return the warn flag
FLAG AffixMgr::get_warn() const {
  return warn;
}
3578 | | |
// return the forbidwarn option
int AffixMgr::get_forbidwarn() const {
  return forbidwarn;
}
3582 | | |
// return the checksharps option
int AffixMgr::get_checksharps() const {
  return checksharps;
}
3586 | | |
// return the string form of `aflag`, as produced by the hash manager's
// encode_flag
std::string AffixMgr::encode_flag(unsigned short aflag) const {
  return pHMgr->encode_flag(aflag);
}
3590 | | |
3591 | | // return the preferred ignore string for suggestions |
3592 | 7.88M | const char* AffixMgr::get_ignore() const { |
3593 | 7.88M | if (ignorechars.empty()) |
3594 | 7.83M | return NULL; |
3595 | 53.9k | return ignorechars.c_str(); |
3596 | 7.88M | } |
3597 | | |
// return the ignored characters in their w_char (UTF-16) form
const std::vector<w_char>& AffixMgr::get_ignore_utf16() const {
  return ignorechars_utf16;
}
3602 | | |
3603 | | // return the keyboard string for suggestions |
3604 | 18.3k | const std::string& AffixMgr::get_key_string() { |
3605 | 18.3k | if (keystring.empty()) |
3606 | 18.1k | keystring = SPELL_KEYSTRING; |
3607 | 18.3k | return keystring; |
3608 | 18.3k | } |
3609 | | |
// return the preferred try string for suggestions (trystring)
const std::string& AffixMgr::get_try_string() const {
  return trystring;
}
3614 | | |
// return the word characters string (wordchars)
const std::string& AffixMgr::get_wordchars() const {
  return wordchars;
}
3619 | | |
// return the word characters in their w_char (UTF-16) form
const std::vector<w_char>& AffixMgr::get_wordchars_utf16() const {
  return wordchars_utf16;
}
3623 | | |
// is there compounding? True when compoundflag or compoundbegin is set, or
// when defcpdtable is not empty.
int AffixMgr::get_compound() const {
  return compoundflag || compoundbegin || !defcpdtable.empty();
}
3628 | | |
// return the compound words control flag (compoundflag)
FLAG AffixMgr::get_compoundflag() const {
  return compoundflag;
}
3633 | | |
// return the forbidden words control flag (forbiddenword)
FLAG AffixMgr::get_forbiddenword() const {
  return forbiddenword;
}
3638 | | |
// return the nosuggest control flag
FLAG AffixMgr::get_nosuggest() const {
  return nosuggest;
}
3643 | | |
// return the nongramsuggest control flag
FLAG AffixMgr::get_nongramsuggest() const {
  return nongramsuggest;
}
3648 | | |
// return the substandard root/affix control flag (substandard)
FLAG AffixMgr::get_substandard() const {
  return substandard;
}
3653 | | |
// return the needaffix flag
FLAG AffixMgr::get_needaffix() const {
  return needaffix;
}
3658 | | |
// return the onlyincompound flag
FLAG AffixMgr::get_onlyincompound() const {
  return onlyincompound;
}
3663 | | |
// return the version string (version)
const std::string& AffixMgr::get_version() const {
  return version;
}
3668 | | |
3669 | | // utility method to look up root words in hash table |
3670 | 2.09G | struct hentry* AffixMgr::lookup(const char* word, size_t len) { |
3671 | 2.09G | struct hentry* he = NULL; |
3672 | 4.18G | for (size_t i = 0; i < alldic.size() && !he; ++i) { |
3673 | 2.09G | he = alldic[i]->lookup(word, len); |
3674 | 2.09G | } |
3675 | 2.09G | return he; |
3676 | 2.09G | } |
3677 | | |
// return the havecontclass setting
int AffixMgr::have_contclass() const {
  return havecontclass;
}
3682 | | |
// return the utf8 setting
int AffixMgr::get_utf8() const {
  return utf8;
}
3687 | | |
// return maxngramsugs
int AffixMgr::get_maxngramsugs(void) const {
  return maxngramsugs;
}
3691 | | |
// return maxcpdsugs
int AffixMgr::get_maxcpdsugs(void) const {
  return maxcpdsugs;
}
3695 | | |
// return maxdiff
int AffixMgr::get_maxdiff(void) const {
  return maxdiff;
}
3699 | | |
// return onlymaxdiff
int AffixMgr::get_onlymaxdiff(void) const {
  return onlymaxdiff;
}
3703 | | |
// return the nosplitsugs option
int AffixMgr::get_nosplitsugs(void) const {
  return nosplitsugs;
}
3708 | | |
// return the sugswithdots option
int AffixMgr::get_sugswithdots(void) const {
  return sugswithdots;
}
3713 | | |
3714 | | /* parse flag */ |
3715 | 13.1k | bool AffixMgr::parse_flag(const std::string& line, unsigned short* out, FileMgr* af) { |
3716 | 13.1k | if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) { |
3717 | 255 | HUNSPELL_WARNING( |
3718 | 255 | stderr, |
3719 | 255 | "error: line %d: multiple definitions of an affix file parameter\n", |
3720 | 255 | af->getlinenum()); |
3721 | 255 | return false; |
3722 | 255 | } |
3723 | 12.8k | std::string s; |
3724 | 12.8k | if (!parse_string(line, s, af->getlinenum())) |
3725 | 128 | return false; |
3726 | 12.7k | *out = pHMgr->decode_flag(s); |
3727 | 12.7k | return true; |
3728 | 12.8k | } |
3729 | | |
3730 | | /* parse num */ |
3731 | 2.31k | bool AffixMgr::parse_num(const std::string& line, int* out, FileMgr* af) { |
3732 | 2.31k | if (*out != -1) { |
3733 | 52 | HUNSPELL_WARNING( |
3734 | 52 | stderr, |
3735 | 52 | "error: line %d: multiple definitions of an affix file parameter\n", |
3736 | 52 | af->getlinenum()); |
3737 | 52 | return false; |
3738 | 52 | } |
3739 | 2.26k | std::string s; |
3740 | 2.26k | if (!parse_string(line, s, af->getlinenum())) |
3741 | 25 | return false; |
3742 | 2.23k | *out = atoi(s.c_str()); |
3743 | 2.23k | return true; |
3744 | 2.26k | } |
3745 | | |
3746 | | /* parse in the max syllablecount of compound words and */ |
3747 | 51.4k | bool AffixMgr::parse_cpdsyllable(const std::string& line, FileMgr* af) { |
3748 | 51.4k | int i = 0; |
3749 | 51.4k | int np = 0; |
3750 | 51.4k | auto iter = line.begin(), start_piece = mystrsep(line, iter); |
3751 | 235k | while (start_piece != line.end()) { |
3752 | 183k | switch (i) { |
3753 | 51.4k | case 0: { |
3754 | 51.4k | np++; |
3755 | 51.4k | break; |
3756 | 0 | } |
3757 | 51.4k | case 1: { |
3758 | 51.4k | cpdmaxsyllable = atoi(std::string(start_piece, iter).c_str()); |
3759 | 51.4k | np++; |
3760 | 51.4k | break; |
3761 | 0 | } |
3762 | 49.7k | case 2: { |
3763 | 49.7k | if (!utf8) { |
3764 | 3.42k | cpdvowels.assign(start_piece, iter); |
3765 | 3.42k | std::sort(cpdvowels.begin(), cpdvowels.end()); |
3766 | 46.3k | } else { |
3767 | 46.3k | std::string piece(start_piece, iter); |
3768 | 46.3k | u8_u16(cpdvowels_utf16, piece); |
3769 | 46.3k | std::sort(cpdvowels_utf16.begin(), cpdvowels_utf16.end()); |
3770 | 46.3k | } |
3771 | 49.7k | np++; |
3772 | 49.7k | break; |
3773 | 0 | } |
3774 | 31.1k | default: |
3775 | 31.1k | break; |
3776 | 183k | } |
3777 | 183k | ++i; |
3778 | 183k | start_piece = mystrsep(line, iter); |
3779 | 183k | } |
3780 | 51.4k | if (np < 2) { |
3781 | 7 | HUNSPELL_WARNING(stderr, |
3782 | 7 | "error: line %d: missing compoundsyllable information\n", |
3783 | 7 | af->getlinenum()); |
3784 | 7 | return false; |
3785 | 7 | } |
3786 | 51.4k | if (np == 2) |
3787 | 1.73k | cpdvowels = "AEIOUaeiou"; |
3788 | 51.4k | return true; |
3789 | 51.4k | } |
3790 | | |
3791 | | bool AffixMgr::parse_convtable(const std::string& line, |
3792 | | FileMgr* af, |
3793 | | RepList** rl, |
3794 | 2.55k | const std::string& keyword) { |
3795 | 2.55k | if (*rl) { |
3796 | 107 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", |
3797 | 107 | af->getlinenum()); |
3798 | 107 | return false; |
3799 | 107 | } |
3800 | 2.44k | int i = 0; |
3801 | 2.44k | int np = 0; |
3802 | 2.44k | int numrl = 0; |
3803 | 2.44k | auto iter = line.begin(), start_piece = mystrsep(line, iter); |
3804 | 10.8k | while (start_piece != line.end()) { |
3805 | 8.47k | switch (i) { |
3806 | 2.44k | case 0: { |
3807 | 2.44k | np++; |
3808 | 2.44k | break; |
3809 | 0 | } |
3810 | 2.42k | case 1: { |
3811 | 2.42k | numrl = atoi(std::string(start_piece, iter).c_str()); |
3812 | 2.42k | if (numrl < 1) { |
3813 | 93 | HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", |
3814 | 93 | af->getlinenum()); |
3815 | 93 | return false; |
3816 | 93 | } |
3817 | 2.33k | *rl = new RepList(numrl); |
3818 | 2.33k | if (!*rl) |
3819 | 0 | return false; |
3820 | 2.33k | np++; |
3821 | 2.33k | break; |
3822 | 2.33k | } |
3823 | 3.60k | default: |
3824 | 3.60k | break; |
3825 | 8.47k | } |
3826 | 8.38k | ++i; |
3827 | 8.38k | start_piece = mystrsep(line, iter); |
3828 | 8.38k | } |
3829 | 2.35k | if (np != 2) { |
3830 | 25 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", |
3831 | 25 | af->getlinenum()); |
3832 | 25 | return false; |
3833 | 25 | } |
3834 | | |
3835 | | /* now parse the num lines to read in the remainder of the table */ |
3836 | 12.5k | for (int j = 0; j < numrl; j++) { |
3837 | 11.0k | std::string nl; |
3838 | 11.0k | if (!af->getline(nl)) |
3839 | 518 | return false; |
3840 | 10.4k | mychomp(nl); |
3841 | 10.4k | i = 0; |
3842 | 10.4k | std::string pattern; |
3843 | 10.4k | std::string pattern2; |
3844 | 10.4k | iter = nl.begin(); |
3845 | 10.4k | start_piece = mystrsep(nl, iter); |
3846 | 50.2k | while (start_piece != nl.end()) { |
3847 | 39.8k | { |
3848 | 39.8k | switch (i) { |
3849 | 10.4k | case 0: { |
3850 | 10.4k | if (nl.compare(start_piece - nl.begin(), keyword.size(), keyword, 0, keyword.size()) != 0) { |
3851 | 148 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
3852 | 148 | af->getlinenum()); |
3853 | 148 | delete *rl; |
3854 | 148 | *rl = NULL; |
3855 | 148 | return false; |
3856 | 148 | } |
3857 | 10.3k | break; |
3858 | 10.4k | } |
3859 | 10.3k | case 1: { |
3860 | 10.3k | pattern.assign(start_piece, iter); |
3861 | 10.3k | break; |
3862 | 10.4k | } |
3863 | 10.2k | case 2: { |
3864 | 10.2k | pattern2.assign(start_piece, iter); |
3865 | 10.2k | break; |
3866 | 10.4k | } |
3867 | 8.86k | default: |
3868 | 8.86k | break; |
3869 | 39.8k | } |
3870 | 39.7k | ++i; |
3871 | 39.7k | } |
3872 | 0 | start_piece = mystrsep(nl, iter); |
3873 | 39.7k | } |
3874 | 10.3k | if (pattern.empty() || pattern2.empty()) { |
3875 | 105 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
3876 | 105 | af->getlinenum()); |
3877 | 105 | return false; |
3878 | 105 | } |
3879 | | |
3880 | 10.2k | (*rl)->add(pattern, pattern2); |
3881 | 10.2k | } |
3882 | 1.55k | return true; |
3883 | 2.33k | } |
3884 | | |
3885 | | /* parse in the typical fault correcting table */ |
3886 | 1.33k | bool AffixMgr::parse_phonetable(const std::string& line, FileMgr* af) { |
3887 | 1.33k | if (phone) { |
3888 | 50 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", |
3889 | 50 | af->getlinenum()); |
3890 | 50 | return false; |
3891 | 50 | } |
3892 | 1.28k | std::unique_ptr<phonetable> new_phone; |
3893 | 1.28k | int num = -1; |
3894 | 1.28k | int i = 0; |
3895 | 1.28k | int np = 0; |
3896 | 1.28k | auto iter = line.begin(), start_piece = mystrsep(line, iter); |
3897 | 4.74k | while (start_piece != line.end()) { |
3898 | 3.52k | switch (i) { |
3899 | 1.28k | case 0: { |
3900 | 1.28k | np++; |
3901 | 1.28k | break; |
3902 | 0 | } |
3903 | 1.26k | case 1: { |
3904 | 1.26k | num = atoi(std::string(start_piece, iter).c_str()); |
3905 | 1.26k | if (num < 1) { |
3906 | 67 | HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", |
3907 | 67 | af->getlinenum()); |
3908 | 67 | return false; |
3909 | 67 | } |
3910 | 1.19k | new_phone.reset(new phonetable); |
3911 | 1.19k | new_phone->utf8 = (char)utf8; |
3912 | 1.19k | np++; |
3913 | 1.19k | break; |
3914 | 1.26k | } |
3915 | 979 | default: |
3916 | 979 | break; |
3917 | 3.52k | } |
3918 | 3.46k | ++i; |
3919 | 3.46k | start_piece = mystrsep(line, iter); |
3920 | 3.46k | } |
3921 | 1.21k | if (np != 2) { |
3922 | 18 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", |
3923 | 18 | af->getlinenum()); |
3924 | 18 | return false; |
3925 | 18 | } |
3926 | | |
3927 | | /* now parse the phone->num lines to read in the remainder of the table */ |
3928 | 3.44k | for (int j = 0; j < num; ++j) { |
3929 | 2.43k | std::string nl; |
3930 | 2.43k | if (!af->getline(nl)) |
3931 | 67 | return false; |
3932 | 2.36k | mychomp(nl); |
3933 | 2.36k | i = 0; |
3934 | 2.36k | const size_t old_size = new_phone->rules.size(); |
3935 | 2.36k | iter = nl.begin(); |
3936 | 2.36k | start_piece = mystrsep(nl, iter); |
3937 | 11.6k | while (start_piece != nl.end()) { |
3938 | 9.38k | { |
3939 | 9.38k | switch (i) { |
3940 | 2.36k | case 0: { |
3941 | 2.36k | if (nl.compare(start_piece - nl.begin(), 5, "PHONE", 5) != 0) { |
3942 | 69 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
3943 | 69 | af->getlinenum()); |
3944 | 69 | return false; |
3945 | 69 | } |
3946 | 2.29k | break; |
3947 | 2.36k | } |
3948 | 2.29k | case 1: { |
3949 | 2.28k | new_phone->rules.emplace_back(start_piece, iter); |
3950 | 2.28k | break; |
3951 | 2.36k | } |
3952 | 2.24k | case 2: { |
3953 | 2.24k | new_phone->rules.emplace_back(start_piece, iter); |
3954 | 2.24k | mystrrep(new_phone->rules.back(), "_", ""); |
3955 | 2.24k | break; |
3956 | 2.36k | } |
3957 | 2.48k | default: |
3958 | 2.48k | break; |
3959 | 9.38k | } |
3960 | 9.31k | ++i; |
3961 | 9.31k | } |
3962 | 0 | start_piece = mystrsep(nl, iter); |
3963 | 9.31k | } |
3964 | 2.29k | if (new_phone->rules.size() != old_size + 2) { |
3965 | 49 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
3966 | 49 | af->getlinenum()); |
3967 | 49 | return false; |
3968 | 49 | } |
3969 | 2.29k | } |
3970 | 1.01k | new_phone->rules.emplace_back(""); |
3971 | 1.01k | new_phone->rules.emplace_back(""); |
3972 | 1.01k | init_phonet_hash(*new_phone); |
3973 | 1.01k | phone = new_phone.release(); |
3974 | 1.01k | return true; |
3975 | 1.19k | } |
3976 | | |
3977 | | /* parse in the checkcompoundpattern table */ |
3978 | 869 | bool AffixMgr::parse_checkcpdtable(const std::string& line, FileMgr* af) { |
3979 | 869 | if (parsedcheckcpd) { |
3980 | 4 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", |
3981 | 4 | af->getlinenum()); |
3982 | 4 | return false; |
3983 | 4 | } |
3984 | 865 | parsedcheckcpd = true; |
3985 | 865 | int numcheckcpd = -1; |
3986 | 865 | int i = 0; |
3987 | 865 | int np = 0; |
3988 | 865 | auto iter = line.begin(), start_piece = mystrsep(line, iter); |
3989 | 3.72k | while (start_piece != line.end()) { |
3990 | 2.90k | switch (i) { |
3991 | 865 | case 0: { |
3992 | 865 | np++; |
3993 | 865 | break; |
3994 | 0 | } |
3995 | 863 | case 1: { |
3996 | 863 | numcheckcpd = atoi(std::string(start_piece, iter).c_str()); |
3997 | 863 | if (numcheckcpd < 1) { |
3998 | 45 | HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", |
3999 | 45 | af->getlinenum()); |
4000 | 45 | return false; |
4001 | 45 | } |
4002 | 818 | checkcpdtable.reserve(std::min(numcheckcpd, 16384)); |
4003 | 818 | np++; |
4004 | 818 | break; |
4005 | 863 | } |
4006 | 1.18k | default: |
4007 | 1.18k | break; |
4008 | 2.90k | } |
4009 | 2.86k | ++i; |
4010 | 2.86k | start_piece = mystrsep(line, iter); |
4011 | 2.86k | } |
4012 | 820 | if (np != 2) { |
4013 | 2 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", |
4014 | 2 | af->getlinenum()); |
4015 | 2 | return false; |
4016 | 2 | } |
4017 | | |
4018 | | /* now parse the numcheckcpd lines to read in the remainder of the table */ |
4019 | 9.30k | for (int j = 0; j < numcheckcpd; ++j) { |
4020 | 9.23k | std::string nl; |
4021 | 9.23k | if (!af->getline(nl)) |
4022 | 582 | return false; |
4023 | 8.65k | mychomp(nl); |
4024 | 8.65k | i = 0; |
4025 | 8.65k | checkcpdtable.emplace_back(); |
4026 | 8.65k | iter = nl.begin(); |
4027 | 8.65k | start_piece = mystrsep(nl, iter); |
4028 | 35.2k | while (start_piece != nl.end()) { |
4029 | 26.7k | switch (i) { |
4030 | 8.06k | case 0: { |
4031 | 8.06k | if (nl.compare(start_piece - nl.begin(), 20, "CHECKCOMPOUNDPATTERN", 20) != 0) { |
4032 | 169 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
4033 | 169 | af->getlinenum()); |
4034 | 169 | checkcpdtable.clear(); |
4035 | 169 | return false; |
4036 | 169 | } |
4037 | 7.89k | break; |
4038 | 8.06k | } |
4039 | 7.89k | case 1: { |
4040 | 6.84k | checkcpdtable.back().pattern.assign(start_piece, iter); |
4041 | 6.84k | size_t slash_pos = checkcpdtable.back().pattern.find('/'); |
4042 | 6.84k | if (slash_pos != std::string::npos) { |
4043 | 5.23k | std::string chunk(checkcpdtable.back().pattern, slash_pos + 1); |
4044 | 5.23k | checkcpdtable.back().pattern.resize(slash_pos); |
4045 | 5.23k | checkcpdtable.back().cond = pHMgr->decode_flag(chunk); |
4046 | 5.23k | } |
4047 | 6.84k | break; |
4048 | 8.06k | } |
4049 | 5.38k | case 2: { |
4050 | 5.38k | checkcpdtable.back().pattern2.assign(start_piece, iter); |
4051 | 5.38k | size_t slash_pos = checkcpdtable.back().pattern2.find('/'); |
4052 | 5.38k | if (slash_pos != std::string::npos) { |
4053 | 4.35k | std::string chunk(checkcpdtable.back().pattern2, slash_pos + 1); |
4054 | 4.35k | checkcpdtable.back().pattern2.resize(slash_pos); |
4055 | 4.35k | checkcpdtable.back().cond2 = pHMgr->decode_flag(chunk); |
4056 | 4.35k | } |
4057 | 5.38k | break; |
4058 | 8.06k | } |
4059 | 3.37k | case 3: { |
4060 | 3.37k | checkcpdtable.back().pattern3.assign(start_piece, iter); |
4061 | 3.37k | simplifiedcpd = 1; |
4062 | 3.37k | break; |
4063 | 8.06k | } |
4064 | 3.10k | default: |
4065 | 3.10k | break; |
4066 | 26.7k | } |
4067 | 26.6k | i++; |
4068 | 26.6k | start_piece = mystrsep(nl, iter); |
4069 | 26.6k | } |
4070 | 8.65k | } |
4071 | 67 | return true; |
4072 | 818 | } |
4073 | | |
4074 | | /* parse in the compound rule table */ |
4075 | 933 | bool AffixMgr::parse_defcpdtable(const std::string& line, FileMgr* af) { |
4076 | 933 | if (parseddefcpd) { |
4077 | 4 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", |
4078 | 4 | af->getlinenum()); |
4079 | 4 | return false; |
4080 | 4 | } |
4081 | 929 | parseddefcpd = true; |
4082 | 929 | int numdefcpd = -1; |
4083 | 929 | int i = 0; |
4084 | 929 | int np = 0; |
4085 | 929 | auto iter = line.begin(), start_piece = mystrsep(line, iter); |
4086 | 3.38k | while (start_piece != line.end()) { |
4087 | 2.50k | switch (i) { |
4088 | 929 | case 0: { |
4089 | 929 | np++; |
4090 | 929 | break; |
4091 | 0 | } |
4092 | 916 | case 1: { |
4093 | 916 | numdefcpd = atoi(std::string(start_piece, iter).c_str()); |
4094 | 916 | if (numdefcpd < 1) { |
4095 | 46 | HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", |
4096 | 46 | af->getlinenum()); |
4097 | 46 | return false; |
4098 | 46 | } |
4099 | 870 | defcpdtable.reserve(std::min(numdefcpd, 16384)); |
4100 | 870 | np++; |
4101 | 870 | break; |
4102 | 916 | } |
4103 | 656 | default: |
4104 | 656 | break; |
4105 | 2.50k | } |
4106 | 2.45k | ++i; |
4107 | 2.45k | start_piece = mystrsep(line, iter); |
4108 | 2.45k | } |
4109 | 883 | if (np != 2) { |
4110 | 13 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", |
4111 | 13 | af->getlinenum()); |
4112 | 13 | return false; |
4113 | 13 | } |
4114 | | |
4115 | | /* now parse the numdefcpd lines to read in the remainder of the table */ |
4116 | 5.54k | for (int j = 0; j < numdefcpd; ++j) { |
4117 | 5.47k | std::string nl; |
4118 | 5.47k | if (!af->getline(nl)) |
4119 | 390 | return false; |
4120 | 5.08k | mychomp(nl); |
4121 | 5.08k | i = 0; |
4122 | 5.08k | defcpdtable.emplace_back(); |
4123 | 5.08k | iter = nl.begin(); |
4124 | 5.08k | start_piece = mystrsep(nl, iter); |
4125 | 17.2k | while (start_piece != nl.end()) { |
4126 | 12.5k | switch (i) { |
4127 | 5.08k | case 0: { |
4128 | 5.08k | if (nl.compare(start_piece - nl.begin(), 12, "COMPOUNDRULE", 12) != 0) { |
4129 | 378 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
4130 | 378 | af->getlinenum()); |
4131 | 378 | numdefcpd = 0; |
4132 | 378 | return false; |
4133 | 378 | } |
4134 | 4.70k | break; |
4135 | 5.08k | } |
4136 | 4.70k | case 1: { // handle parenthesized flags |
4137 | 4.68k | if (std::find(start_piece, iter, '(') != iter) { |
4138 | 302k | for (auto k = start_piece; k != iter; ++k) { |
4139 | 302k | auto chb = k, che = k + 1; |
4140 | 302k | if (*k == '(') { |
4141 | 2.89k | auto parpos = std::find(k, iter, ')'); |
4142 | 2.89k | if (parpos != iter) { |
4143 | 1.15k | chb = k + 1; |
4144 | 1.15k | che = parpos; |
4145 | 1.15k | k = parpos; |
4146 | 1.15k | } |
4147 | 2.89k | } |
4148 | | |
4149 | 302k | if (*chb == '*' || *chb == '?') { |
4150 | 6.00k | defcpdtable.back().push_back((FLAG)*chb); |
4151 | 296k | } else { |
4152 | 296k | pHMgr->decode_flags(defcpdtable.back(), std::string(chb, che), af); |
4153 | 296k | } |
4154 | 302k | } |
4155 | 3.99k | } else { |
4156 | 3.99k | pHMgr->decode_flags(defcpdtable.back(), std::string(start_piece, iter), af); |
4157 | 3.99k | } |
4158 | 4.68k | break; |
4159 | 5.08k | } |
4160 | 2.80k | default: |
4161 | 2.80k | break; |
4162 | 12.5k | } |
4163 | 12.2k | ++i; |
4164 | 12.2k | start_piece = mystrsep(nl, iter); |
4165 | 12.2k | } |
4166 | 4.71k | if (defcpdtable.back().empty()) { |
4167 | 32 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
4168 | 32 | af->getlinenum()); |
4169 | 32 | return false; |
4170 | 32 | } |
4171 | 4.71k | } |
4172 | 70 | return true; |
4173 | 870 | } |
4174 | | |
4175 | | /* parse in the character map table */ |
4176 | 731 | bool AffixMgr::parse_maptable(const std::string& line, FileMgr* af) { |
4177 | 731 | if (parsedmaptable) { |
4178 | 18 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", |
4179 | 18 | af->getlinenum()); |
4180 | 18 | return false; |
4181 | 18 | } |
4182 | 713 | parsedmaptable = true; |
4183 | 713 | int nummap = -1; |
4184 | 713 | int i = 0; |
4185 | 713 | int np = 0; |
4186 | 713 | auto iter = line.begin(), start_piece = mystrsep(line, iter); |
4187 | 3.04k | while (start_piece != line.end()) { |
4188 | 2.42k | switch (i) { |
4189 | 713 | case 0: { |
4190 | 713 | np++; |
4191 | 713 | break; |
4192 | 0 | } |
4193 | 705 | case 1: { |
4194 | 705 | nummap = atoi(std::string(start_piece, iter).c_str()); |
4195 | 705 | if (nummap < 1) { |
4196 | 90 | HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", |
4197 | 90 | af->getlinenum()); |
4198 | 90 | return false; |
4199 | 90 | } |
4200 | 615 | maptable.reserve(std::min(nummap, 16384)); |
4201 | 615 | np++; |
4202 | 615 | break; |
4203 | 705 | } |
4204 | 1.00k | default: |
4205 | 1.00k | break; |
4206 | 2.42k | } |
4207 | 2.33k | ++i; |
4208 | 2.33k | start_piece = mystrsep(line, iter); |
4209 | 2.33k | } |
4210 | 623 | if (np != 2) { |
4211 | 8 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", |
4212 | 8 | af->getlinenum()); |
4213 | 8 | return false; |
4214 | 8 | } |
4215 | | |
4216 | | /* now parse the nummap lines to read in the remainder of the table */ |
4217 | 2.92k | for (int j = 0; j < nummap; ++j) { |
4218 | 2.61k | std::string nl; |
4219 | 2.61k | if (!af->getline(nl)) |
4220 | 192 | return false; |
4221 | 2.41k | mychomp(nl); |
4222 | 2.41k | i = 0; |
4223 | 2.41k | maptable.emplace_back(); |
4224 | 2.41k | iter = nl.begin(); |
4225 | 2.41k | start_piece = mystrsep(nl, iter); |
4226 | 8.20k | while (start_piece != nl.end()) { |
4227 | 5.88k | switch (i) { |
4228 | 2.41k | case 0: { |
4229 | 2.41k | if (nl.compare(start_piece - nl.begin(), 3, "MAP", 3) != 0) { |
4230 | 95 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
4231 | 95 | af->getlinenum()); |
4232 | 95 | nummap = 0; |
4233 | 95 | return false; |
4234 | 95 | } |
4235 | 2.31k | break; |
4236 | 2.41k | } |
4237 | 2.31k | case 1: { |
4238 | 390k | for (auto k = start_piece; k != iter; ++k) { |
4239 | 388k | auto chb = k, che = k + 1; |
4240 | 388k | if (*k == '(') { |
4241 | 2.57k | auto parpos = std::find(k, iter, ')'); |
4242 | 2.57k | if (parpos != iter) { |
4243 | 877 | chb = k + 1; |
4244 | 877 | che = parpos; |
4245 | 877 | k = parpos; |
4246 | 877 | } |
4247 | 385k | } else { |
4248 | 385k | if (utf8 && (*k & 0xc0) == 0xc0) { |
4249 | 2.99k | ++k; |
4250 | 3.47k | while (k != iter && (*k & 0xc0) == 0x80) |
4251 | 477 | ++k; |
4252 | 2.99k | che = k; |
4253 | 2.99k | --k; |
4254 | 2.99k | } |
4255 | 385k | } |
4256 | 388k | if (chb == che) { |
4257 | 511 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
4258 | 511 | af->getlinenum()); |
4259 | 511 | } |
4260 | | |
4261 | 388k | maptable.back().emplace_back(chb, che); |
4262 | 388k | } |
4263 | 2.30k | break; |
4264 | 2.41k | } |
4265 | 1.16k | default: |
4266 | 1.16k | break; |
4267 | 5.88k | } |
4268 | 5.79k | ++i; |
4269 | 5.79k | start_piece = mystrsep(nl, iter); |
4270 | 5.79k | } |
4271 | 2.32k | if (maptable.back().empty()) { |
4272 | 15 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
4273 | 15 | af->getlinenum()); |
4274 | 15 | return false; |
4275 | 15 | } |
4276 | 2.32k | } |
4277 | 313 | return true; |
4278 | 615 | } |
4279 | | |
4280 | | /* parse in the word breakpoint table */ |
4281 | 1.94k | bool AffixMgr::parse_breaktable(const std::string& line, FileMgr* af) { |
4282 | 1.94k | if (parsedbreaktable) { |
4283 | 46 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", |
4284 | 46 | af->getlinenum()); |
4285 | 46 | return false; |
4286 | 46 | } |
4287 | 1.89k | parsedbreaktable = true; |
4288 | 1.89k | int numbreak = -1; |
4289 | 1.89k | int i = 0; |
4290 | 1.89k | int np = 0; |
4291 | 1.89k | auto iter = line.begin(), start_piece = mystrsep(line, iter); |
4292 | 4.84k | while (start_piece != line.end()) { |
4293 | 4.46k | switch (i) { |
4294 | 1.89k | case 0: { |
4295 | 1.89k | np++; |
4296 | 1.89k | break; |
4297 | 0 | } |
4298 | 1.81k | case 1: { |
4299 | 1.81k | numbreak = atoi(std::string(start_piece, iter).c_str()); |
4300 | 1.81k | if (numbreak < 0) { |
4301 | 34 | HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", |
4302 | 34 | af->getlinenum()); |
4303 | 34 | return false; |
4304 | 34 | } |
4305 | 1.77k | if (numbreak == 0) |
4306 | 1.48k | return true; |
4307 | 297 | breaktable.reserve(std::min(numbreak, 16384)); |
4308 | 297 | np++; |
4309 | 297 | break; |
4310 | 1.77k | } |
4311 | 755 | default: |
4312 | 755 | break; |
4313 | 4.46k | } |
4314 | 2.94k | ++i; |
4315 | 2.94k | start_piece = mystrsep(line, iter); |
4316 | 2.94k | } |
4317 | 381 | if (np != 2) { |
4318 | 84 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", |
4319 | 84 | af->getlinenum()); |
4320 | 84 | return false; |
4321 | 84 | } |
4322 | | |
4323 | | /* now parse the numbreak lines to read in the remainder of the table */ |
4324 | 4.34k | for (int j = 0; j < numbreak; ++j) { |
4325 | 4.29k | std::string nl; |
4326 | 4.29k | if (!af->getline(nl)) |
4327 | 155 | return false; |
4328 | 4.13k | mychomp(nl); |
4329 | 4.13k | i = 0; |
4330 | 4.13k | iter = nl.begin(); |
4331 | 4.13k | start_piece = mystrsep(nl, iter); |
4332 | 10.8k | while (start_piece != nl.end()) { |
4333 | 6.84k | switch (i) { |
4334 | 2.62k | case 0: { |
4335 | 2.62k | if (nl.compare(start_piece - nl.begin(), 5, "BREAK", 5) != 0) { |
4336 | 89 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
4337 | 89 | af->getlinenum()); |
4338 | 89 | numbreak = 0; |
4339 | 89 | return false; |
4340 | 89 | } |
4341 | 2.53k | break; |
4342 | 2.62k | } |
4343 | 2.53k | case 1: { |
4344 | 2.49k | breaktable.emplace_back(start_piece, iter); |
4345 | 2.49k | break; |
4346 | 2.62k | } |
4347 | 1.72k | default: |
4348 | 1.72k | break; |
4349 | 6.84k | } |
4350 | 6.75k | ++i; |
4351 | 6.75k | start_piece = mystrsep(nl, iter); |
4352 | 6.75k | } |
4353 | 4.13k | } |
4354 | | |
4355 | 53 | if (breaktable.size() != static_cast<size_t>(numbreak)) { |
4356 | 3 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
4357 | 3 | af->getlinenum()); |
4358 | 3 | return false; |
4359 | 3 | } |
4360 | | |
4361 | 50 | return true; |
4362 | 53 | } |
4363 | | |
4364 | 28.6k | void AffixMgr::reverse_condition(std::string& piece) { |
4365 | 28.6k | if (piece.empty()) |
4366 | 0 | return; |
4367 | | |
4368 | 28.6k | int neg = 0; |
4369 | 1.81M | for (auto k = piece.rbegin(); k != piece.rend(); ++k) { |
4370 | 1.79M | switch (*k) { |
4371 | 50.4k | case '[': { |
4372 | 50.4k | if (neg) |
4373 | 1.36k | *(k - 1) = '['; |
4374 | 49.1k | else |
4375 | 49.1k | *k = ']'; |
4376 | 50.4k | break; |
4377 | 0 | } |
4378 | 35.5k | case ']': { |
4379 | 35.5k | *k = '['; |
4380 | 35.5k | if (neg) |
4381 | 7.12k | *(k - 1) = '^'; |
4382 | 35.5k | neg = 0; |
4383 | 35.5k | break; |
4384 | 0 | } |
4385 | 35.7k | case '^': { |
4386 | 35.7k | if (*(k - 1) == ']') |
4387 | 9.69k | neg = 1; |
4388 | 26.0k | else if (neg) |
4389 | 11.7k | *(k - 1) = *k; |
4390 | 35.7k | break; |
4391 | 0 | } |
4392 | 1.66M | default: { |
4393 | 1.66M | if (neg) |
4394 | 165k | *(k - 1) = *k; |
4395 | 1.66M | } |
4396 | 1.79M | } |
4397 | 1.79M | } |
4398 | 28.6k | } |
4399 | | |
4400 | | class entries_container { |
4401 | | std::vector<AffEntry*> entries; |
4402 | | AffixMgr* m_mgr; |
4403 | | char m_at; |
4404 | | public: |
4405 | | entries_container(char at, AffixMgr* mgr) |
4406 | 35.1k | : m_mgr(mgr) |
4407 | 35.1k | , m_at(at) { |
4408 | 35.1k | } |
4409 | 33.3k | void release() { |
4410 | 33.3k | entries.clear(); |
4411 | 33.3k | } |
4412 | | void initialize(int numents, |
4413 | 34.4k | char opts, unsigned short aflag) { |
4414 | 34.4k | entries.reserve(std::min(numents, 16384)); |
4415 | | |
4416 | 34.4k | if (m_at == 'P') { |
4417 | 14.0k | entries.push_back(new PfxEntry(m_mgr)); |
4418 | 20.3k | } else { |
4419 | 20.3k | entries.push_back(new SfxEntry(m_mgr)); |
4420 | 20.3k | } |
4421 | | |
4422 | 34.4k | entries.back()->opts = opts; |
4423 | 34.4k | entries.back()->aflag = aflag; |
4424 | 34.4k | } |
4425 | | |
4426 | 19.2k | AffEntry* add_entry(char opts) { |
4427 | 19.2k | if (m_at == 'P') { |
4428 | 9.14k | entries.push_back(new PfxEntry(m_mgr)); |
4429 | 10.1k | } else { |
4430 | 10.1k | entries.push_back(new SfxEntry(m_mgr)); |
4431 | 10.1k | } |
4432 | 19.2k | AffEntry* ret = entries.back(); |
4433 | 19.2k | ret->opts = entries[0]->opts & opts; |
4434 | 19.2k | return ret; |
4435 | 19.2k | } |
4436 | | |
4437 | 53.5k | AffEntry* first_entry() { |
4438 | 53.5k | return entries.empty() ? NULL : entries[0]; |
4439 | 53.5k | } |
4440 | | |
4441 | 35.1k | ~entries_container() { |
4442 | 35.1k | for (auto& entry : entries) { |
4443 | 7.96k | delete entry; |
4444 | 7.96k | } |
4445 | 35.1k | } |
4446 | | |
4447 | 33.3k | std::vector<AffEntry*>::iterator begin() { return entries.begin(); } |
4448 | 33.3k | std::vector<AffEntry*>::iterator end() { return entries.end(); } |
4449 | | }; |
4450 | | |
4451 | | bool AffixMgr::parse_affix(const std::string& line, |
4452 | | const char at, |
4453 | | FileMgr* af, |
4454 | 35.1k | char* dupflags) { |
4455 | 35.1k | int numents = 0; // number of AffEntry structures to parse |
4456 | | |
4457 | 35.1k | unsigned short aflag = 0; // affix char identifier |
4458 | | |
4459 | 35.1k | char ff = 0; |
4460 | 35.1k | entries_container affentries(at, this); |
4461 | | |
4462 | 35.1k | int i = 0; |
4463 | | |
4464 | | // checking lines with bad syntax |
4465 | | #ifdef DEBUG |
4466 | | int basefieldnum = 0; |
4467 | | #endif |
4468 | | |
4469 | | // split affix header line into pieces |
4470 | | |
4471 | 35.1k | int np = 0; |
4472 | 35.1k | auto iter = line.begin(), start_piece = mystrsep(line, iter); |
4473 | 189k | while (start_piece != line.end()) { |
4474 | 154k | switch (i) { |
4475 | | // piece 1 - is type of affix |
4476 | 35.1k | case 0: { |
4477 | 35.1k | np++; |
4478 | 35.1k | break; |
4479 | 0 | } |
4480 | | |
4481 | | // piece 2 - is affix char |
4482 | 34.9k | case 1: { |
4483 | 34.9k | np++; |
4484 | 34.9k | aflag = pHMgr->decode_flag(std::string(start_piece, iter)); |
4485 | 34.9k | if (((at == 'S') && (dupflags[aflag] & dupSFX)) || |
4486 | 34.9k | ((at == 'P') && (dupflags[aflag] & dupPFX))) { |
4487 | 13.9k | HUNSPELL_WARNING( |
4488 | 13.9k | stderr, |
4489 | 13.9k | "error: line %d: multiple definitions of an affix flag\n", |
4490 | 13.9k | af->getlinenum()); |
4491 | 13.9k | } |
4492 | 34.9k | dupflags[aflag] += (char)((at == 'S') ? dupSFX : dupPFX); |
4493 | 34.9k | break; |
4494 | 0 | } |
4495 | | // piece 3 - is cross product indicator |
4496 | 34.7k | case 2: { |
4497 | 34.7k | np++; |
4498 | 34.7k | if (*start_piece == 'Y') |
4499 | 4.96k | ff = aeXPRODUCT; |
4500 | 34.7k | break; |
4501 | 0 | } |
4502 | | |
4503 | | // piece 4 - is number of affentries |
4504 | 34.5k | case 3: { |
4505 | 34.5k | np++; |
4506 | 34.5k | numents = atoi(std::string(start_piece, iter).c_str()); |
4507 | 34.5k | if ((numents <= 0) || ((std::numeric_limits<size_t>::max() / |
4508 | 34.4k | sizeof(AffEntry)) < static_cast<size_t>(numents))) { |
4509 | 178 | std::string err = pHMgr->encode_flag(aflag); |
4510 | 178 | HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", |
4511 | 178 | af->getlinenum()); |
4512 | 178 | return false; |
4513 | 178 | } |
4514 | | |
4515 | 34.4k | char opts = ff; |
4516 | 34.4k | if (utf8) |
4517 | 7.25k | opts |= aeUTF8; |
4518 | 34.4k | if (pHMgr->is_aliasf()) |
4519 | 2.14k | opts |= aeALIASF; |
4520 | 34.4k | if (pHMgr->is_aliasm()) |
4521 | 4.16k | opts |= aeALIASM; |
4522 | 34.4k | affentries.initialize(numents, opts, aflag); |
4523 | 34.4k | } |
4524 | | |
4525 | 49.2k | default: |
4526 | 49.2k | break; |
4527 | 154k | } |
4528 | 154k | ++i; |
4529 | 154k | start_piece = mystrsep(line, iter); |
4530 | 154k | } |
4531 | | // check to make sure we parsed enough pieces |
4532 | 34.9k | if (np != 4) { |
4533 | 506 | std::string err = pHMgr->encode_flag(aflag); |
4534 | 506 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", |
4535 | 506 | af->getlinenum()); |
4536 | 506 | return false; |
4537 | 506 | } |
4538 | | |
4539 | | // now parse numents affentries for this affix |
4540 | 34.4k | AffEntry* entry = affentries.first_entry(); |
4541 | 87.2k | for (int ent = 0; ent < numents; ++ent) { |
4542 | 53.8k | std::string nl; |
4543 | 53.8k | if (!af->getline(nl)) |
4544 | 316 | return false; |
4545 | 53.5k | mychomp(nl); |
4546 | | |
4547 | 53.5k | iter = nl.begin(); |
4548 | 53.5k | i = 0; |
4549 | 53.5k | np = 0; |
4550 | | |
4551 | | // split line into pieces |
4552 | 53.5k | start_piece = mystrsep(nl, iter); |
4553 | 338k | while (start_piece != nl.end()) { |
4554 | 285k | switch (i) { |
4555 | | // piece 1 - is type |
4556 | 53.5k | case 0: { |
4557 | 53.5k | np++; |
4558 | 53.5k | if (ent != 0) |
4559 | 19.2k | entry = affentries.add_entry((char)(aeXPRODUCT | aeUTF8 | aeALIASF | aeALIASM)); |
4560 | 53.5k | break; |
4561 | 0 | } |
4562 | | |
4563 | | // piece 2 - is affix char |
4564 | 53.3k | case 1: { |
4565 | 53.3k | np++; |
4566 | 53.3k | std::string chunk(start_piece, iter); |
4567 | 53.3k | if (pHMgr->decode_flag(chunk) != aflag) { |
4568 | 285 | std::string err = pHMgr->encode_flag(aflag); |
4569 | 285 | HUNSPELL_WARNING(stderr, |
4570 | 285 | "error: line %d: affix %s is corrupt\n", |
4571 | 285 | af->getlinenum(), err.c_str()); |
4572 | 285 | return false; |
4573 | 285 | } |
4574 | | |
4575 | 53.0k | if (ent != 0) { |
4576 | 19.1k | AffEntry* start_entry = affentries.first_entry(); |
4577 | 19.1k | entry->aflag = start_entry->aflag; |
4578 | 19.1k | } |
4579 | 53.0k | break; |
4580 | 53.3k | } |
4581 | | |
4582 | | // piece 3 - is string to strip or 0 for null |
4583 | 52.9k | case 2: { |
4584 | 52.9k | np++; |
4585 | 52.9k | entry->strip = std::string(start_piece, iter); |
4586 | 52.9k | if (complexprefixes) { |
4587 | 21.6k | if (utf8) |
4588 | 5.24k | reverseword_utf(entry->strip); |
4589 | 16.4k | else |
4590 | 16.4k | reverseword(entry->strip); |
4591 | 21.6k | } |
4592 | 52.9k | if (entry->strip.compare("0") == 0) { |
4593 | 2.58k | entry->strip.clear(); |
4594 | 2.58k | } |
4595 | 52.9k | break; |
4596 | 53.3k | } |
4597 | | |
4598 | | // piece 4 - is affix string or 0 for null |
4599 | 52.8k | case 3: { |
4600 | 52.8k | entry->morphcode = NULL; |
4601 | 52.8k | entry->contclass = NULL; |
4602 | 52.8k | entry->contclasslen = 0; |
4603 | 52.8k | np++; |
4604 | 52.8k | std::string::const_iterator dash = std::find(start_piece, iter, '/'); |
4605 | 52.8k | if (dash != iter) { |
4606 | 17.9k | entry->appnd = std::string(start_piece, dash); |
4607 | 17.9k | std::string dash_str(dash + 1, iter); |
4608 | | |
4609 | 17.9k | if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) { |
4610 | 1.35k | if (utf8) { |
4611 | 455 | remove_ignored_chars_utf(entry->appnd, ignorechars_utf16); |
4612 | 900 | } else { |
4613 | 900 | remove_ignored_chars(entry->appnd, ignorechars); |
4614 | 900 | } |
4615 | 1.35k | } |
4616 | | |
4617 | 17.9k | if (complexprefixes) { |
4618 | 6.80k | if (utf8) |
4619 | 1.21k | reverseword_utf(entry->appnd); |
4620 | 5.58k | else |
4621 | 5.58k | reverseword(entry->appnd); |
4622 | 6.80k | } |
4623 | | |
4624 | 17.9k | if (pHMgr->is_aliasf()) { |
4625 | 1.76k | int index = atoi(dash_str.c_str()); |
4626 | 1.76k | entry->contclasslen = (unsigned short)pHMgr->get_aliasf( |
4627 | 1.76k | index, &(entry->contclass), af); |
4628 | 1.76k | if (!entry->contclasslen) |
4629 | 861 | HUNSPELL_WARNING(stderr, |
4630 | 861 | "error: bad affix flag alias: \"%s\"\n", |
4631 | 861 | dash_str.c_str()); |
4632 | 16.1k | } else { |
4633 | 16.1k | entry->contclasslen = (unsigned short)pHMgr->decode_flags( |
4634 | 16.1k | &(entry->contclass), dash_str, af); |
4635 | 16.1k | std::sort(entry->contclass, entry->contclass + entry->contclasslen); |
4636 | 16.1k | } |
4637 | | |
4638 | 17.9k | havecontclass = 1; |
4639 | 736k | for (unsigned short _i = 0; _i < entry->contclasslen; _i++) { |
4640 | 718k | contclasses[(entry->contclass)[_i]] = 1; |
4641 | 718k | } |
4642 | 34.8k | } else { |
4643 | 34.8k | entry->appnd = std::string(start_piece, iter); |
4644 | | |
4645 | 34.8k | if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) { |
4646 | 3.58k | if (utf8) { |
4647 | 1.03k | remove_ignored_chars_utf(entry->appnd, ignorechars_utf16); |
4648 | 2.55k | } else { |
4649 | 2.55k | remove_ignored_chars(entry->appnd, ignorechars); |
4650 | 2.55k | } |
4651 | 3.58k | } |
4652 | | |
4653 | 34.8k | if (complexprefixes) { |
4654 | 14.8k | if (utf8) |
4655 | 4.01k | reverseword_utf(entry->appnd); |
4656 | 10.8k | else |
4657 | 10.8k | reverseword(entry->appnd); |
4658 | 14.8k | } |
4659 | 34.8k | } |
4660 | | |
4661 | 52.8k | if (entry->appnd.compare("0") == 0) { |
4662 | 1.72k | entry->appnd.clear(); |
4663 | 1.72k | } |
4664 | 52.8k | break; |
4665 | 53.3k | } |
4666 | | |
4667 | | // piece 5 - is the conditions descriptions |
4668 | 27.8k | case 4: { |
4669 | 27.8k | std::string chunk(start_piece, iter); |
4670 | 27.8k | np++; |
4671 | 27.8k | if (complexprefixes) { |
4672 | 11.8k | if (utf8) |
4673 | 2.32k | reverseword_utf(chunk); |
4674 | 9.54k | else |
4675 | 9.54k | reverseword(chunk); |
4676 | 11.8k | reverse_condition(chunk); |
4677 | 11.8k | } |
4678 | 27.8k | if (!entry->strip.empty() && chunk != "." && |
4679 | 27.8k | redundant_condition(at, entry->strip, chunk, |
4680 | 27.3k | af->getlinenum())) |
4681 | 3.01k | chunk = "."; |
4682 | 27.8k | if (at == 'S') { |
4683 | 16.7k | reverseword(chunk); |
4684 | 16.7k | reverse_condition(chunk); |
4685 | 16.7k | } |
4686 | 27.8k | if (encodeit(*entry, chunk)) |
4687 | 0 | return false; |
4688 | 27.8k | break; |
4689 | 27.8k | } |
4690 | | |
4691 | 27.8k | case 5: { |
4692 | 19.7k | std::string chunk(start_piece, iter); |
4693 | 19.7k | np++; |
4694 | 19.7k | if (pHMgr->is_aliasm()) { |
4695 | 3.05k | int index = atoi(chunk.c_str()); |
4696 | 3.05k | entry->morphcode = pHMgr->get_aliasm(index); |
4697 | 16.6k | } else { |
4698 | 16.6k | if (complexprefixes) { // XXX - fix me for morph. gen. |
4699 | 6.39k | if (utf8) |
4700 | 1.63k | reverseword_utf(chunk); |
4701 | 4.75k | else |
4702 | 4.75k | reverseword(chunk); |
4703 | 6.39k | } |
4704 | | // add the remaining of the line |
4705 | 16.6k | std::string::const_iterator end = nl.end(); |
4706 | 16.6k | if (iter != end) { |
4707 | 8.22k | chunk.append(iter, end); |
4708 | 8.22k | } |
4709 | 16.6k | entry->morphcode = mystrdup(chunk.c_str()); |
4710 | 16.6k | } |
4711 | 19.7k | break; |
4712 | 27.8k | } |
4713 | 25.0k | default: |
4714 | 25.0k | break; |
4715 | 285k | } |
4716 | 284k | i++; |
4717 | 284k | start_piece = mystrsep(nl, iter); |
4718 | 284k | } |
4719 | | // check to make sure we parsed enough pieces |
4720 | 53.2k | if (np < 4) { |
4721 | 459 | std::string err = pHMgr->encode_flag(aflag); |
4722 | 459 | HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n", |
4723 | 459 | af->getlinenum(), err.c_str()); |
4724 | 459 | return false; |
4725 | 459 | } |
4726 | | |
4727 | | #ifdef DEBUG |
4728 | | // detect unnecessary fields, excepting comments |
4729 | | if (basefieldnum) { |
4730 | | int fieldnum = |
4731 | | !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6); |
4732 | | if (fieldnum != basefieldnum) |
4733 | | HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", |
4734 | | af->getlinenum()); |
4735 | | } else { |
4736 | | basefieldnum = |
4737 | | !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6); |
4738 | | } |
4739 | | #endif |
4740 | 53.2k | } |
4741 | | |
4742 | | // now create SfxEntry or PfxEntry objects and use links to |
4743 | | // build an ordered (sorted by affix string) list |
4744 | 33.3k | auto start = affentries.begin(), end = affentries.end(); |
4745 | 79.0k | for (auto affentry = start; affentry != end; ++affentry) { |
4746 | 45.7k | if (at == 'P') { |
4747 | 18.5k | build_pfxtree(dynamic_cast<PfxEntry*>(*affentry)); |
4748 | 27.1k | } else { |
4749 | 27.1k | build_sfxtree(dynamic_cast<SfxEntry*>(*affentry)); |
4750 | 27.1k | } |
4751 | 45.7k | } |
4752 | | |
4753 | | //contents belong to AffixMgr now |
4754 | 33.3k | affentries.release(); |
4755 | | |
4756 | 33.3k | return true; |
4757 | 34.4k | } |
4758 | | |
4759 | | int AffixMgr::redundant_condition(char ft, |
4760 | | const std::string& strip, |
4761 | | const std::string& cond, |
4762 | 27.3k | int linenum) { |
4763 | 27.3k | int stripl = strip.size(), condl = cond.size(), i, j, neg, in; |
4764 | 27.3k | if (ft == 'P') { // prefix |
4765 | 11.0k | if (strip.compare(0, condl, cond) == 0) |
4766 | 1.30k | return 1; |
4767 | 9.72k | if (utf8) { |
4768 | 8.08k | } else { |
4769 | 13.3k | for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) { |
4770 | 12.2k | if (cond[j] != '[') { |
4771 | 8.55k | if (cond[j] != strip[i]) { |
4772 | 4.25k | HUNSPELL_WARNING(stderr, |
4773 | 4.25k | "warning: line %d: incompatible stripping " |
4774 | 4.25k | "characters and condition\n", |
4775 | 4.25k | linenum); |
4776 | 4.25k | return 0; |
4777 | 4.25k | } |
4778 | 8.55k | } else { |
4779 | 3.70k | neg = (cond[j + 1] == '^') ? 1 : 0; |
4780 | 3.70k | in = 0; |
4781 | 333k | do { |
4782 | 333k | j++; |
4783 | 333k | if (strip[i] == cond[j]) |
4784 | 2.03k | in = 1; |
4785 | 333k | } while ((j < (condl - 1)) && (cond[j] != ']')); |
4786 | 3.70k | if (j == (condl - 1) && (cond[j] != ']')) { |
4787 | 668 | HUNSPELL_WARNING(stderr, |
4788 | 668 | "error: line %d: missing ] in condition:\n%s\n", |
4789 | 668 | linenum, cond.c_str()); |
4790 | 668 | return 0; |
4791 | 668 | } |
4792 | 3.03k | if ((!neg && !in) || (neg && in)) { |
4793 | 2.08k | HUNSPELL_WARNING(stderr, |
4794 | 2.08k | "warning: line %d: incompatible stripping " |
4795 | 2.08k | "characters and condition\n", |
4796 | 2.08k | linenum); |
4797 | 2.08k | return 0; |
4798 | 2.08k | } |
4799 | 3.03k | } |
4800 | 12.2k | } |
4801 | 1.08k | if (j >= condl) |
4802 | 275 | return 1; |
4803 | 1.08k | } |
4804 | 16.2k | } else { // suffix |
4805 | 16.2k | if ((stripl >= condl) && strip.compare(stripl - condl, std::string::npos, cond) == 0) |
4806 | 778 | return 1; |
4807 | 15.5k | if (utf8) { |
4808 | 12.6k | } else { |
4809 | 17.7k | for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) { |
4810 | 16.7k | if (cond[j] != ']') { |
4811 | 9.31k | if (cond[j] != strip[i]) { |
4812 | 6.47k | HUNSPELL_WARNING(stderr, |
4813 | 6.47k | "warning: line %d: incompatible stripping " |
4814 | 6.47k | "characters and condition\n", |
4815 | 6.47k | linenum); |
4816 | 6.47k | return 0; |
4817 | 6.47k | } |
4818 | 9.31k | } else if (j > 0) { |
4819 | 7.22k | in = 0; |
4820 | 222k | do { |
4821 | 222k | j--; |
4822 | 222k | if (strip[i] == cond[j]) |
4823 | 8.09k | in = 1; |
4824 | 222k | } while ((j > 0) && (cond[j] != '[')); |
4825 | 7.22k | if ((j == 0) && (cond[j] != '[')) { |
4826 | 1.93k | HUNSPELL_WARNING(stderr, |
4827 | 1.93k | "error: line: %d: missing ] in condition:\n%s\n", |
4828 | 1.93k | linenum, cond.c_str()); |
4829 | 1.93k | return 0; |
4830 | 1.93k | } |
4831 | 5.28k | neg = (cond[j + 1] == '^') ? 1 : 0; |
4832 | 5.28k | if ((!neg && !in) || (neg && in)) { |
4833 | 3.19k | HUNSPELL_WARNING(stderr, |
4834 | 3.19k | "warning: line %d: incompatible stripping " |
4835 | 3.19k | "characters and condition\n", |
4836 | 3.19k | linenum); |
4837 | 3.19k | return 0; |
4838 | 3.19k | } |
4839 | 5.28k | } |
4840 | 16.7k | } |
4841 | 1.01k | if (j < 0) |
4842 | 660 | return 1; |
4843 | 1.01k | } |
4844 | 15.5k | } |
4845 | 5.69k | return 0; |
4846 | 27.3k | } |
4847 | | |
4848 | | std::vector<std::string> AffixMgr::get_suffix_words(short unsigned* suff, |
4849 | | int len, |
4850 | 0 | const std::string& root_word) { |
4851 | 0 | std::vector<std::string> slst; |
4852 | 0 | short unsigned* start_ptr = suff; |
4853 | 0 | for (auto ptr : sStart) { |
4854 | 0 | while (ptr) { |
4855 | 0 | suff = start_ptr; |
4856 | 0 | for (int i = 0; i < len; i++) { |
4857 | 0 | if ((*suff) == ptr->getFlag()) { |
4858 | 0 | std::string nw(root_word); |
4859 | 0 | nw.append(ptr->getAffix()); |
4860 | 0 | hentry* ht = ptr->checkword(nw, 0, nw.size(), 0, NULL, 0, 0, 0); |
4861 | 0 | if (ht) { |
4862 | 0 | slst.push_back(nw); |
4863 | 0 | } |
4864 | 0 | } |
4865 | 0 | suff++; |
4866 | 0 | } |
4867 | 0 | ptr = ptr->getNext(); |
4868 | 0 | } |
4869 | 0 | } |
4870 | 0 | return slst; |
4871 | 0 | } |