/src/hunspell/src/hunspell/hashmgr.cxx
Line | Count | Source |
1 | | /* ***** BEGIN LICENSE BLOCK ***** |
2 | | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
3 | | * |
4 | | * Copyright (C) 2002-2022 Németh László |
5 | | * |
6 | | * The contents of this file are subject to the Mozilla Public License Version |
7 | | * 1.1 (the "License"); you may not use this file except in compliance with |
8 | | * the License. You may obtain a copy of the License at |
9 | | * http://www.mozilla.org/MPL/ |
10 | | * |
11 | | * Software distributed under the License is distributed on an "AS IS" basis, |
12 | | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
13 | | * for the specific language governing rights and limitations under the |
14 | | * License. |
15 | | * |
16 | | * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. |
17 | | * |
18 | | * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, |
19 | | * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, |
20 | | * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, |
21 | | * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, |
22 | | * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen |
23 | | * |
24 | | * Alternatively, the contents of this file may be used under the terms of |
25 | | * either the GNU General Public License Version 2 or later (the "GPL"), or |
26 | | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
27 | | * in which case the provisions of the GPL or the LGPL are applicable instead |
28 | | * of those above. If you wish to allow use of your version of this file only |
29 | | * under the terms of either the GPL or the LGPL, and not to allow others to |
30 | | * use your version of this file under the terms of the MPL, indicate your |
31 | | * decision by deleting the provisions above and replace them with the notice |
32 | | * and other provisions required by the GPL or the LGPL. If you do not delete |
33 | | * the provisions above, a recipient may use your version of this file under |
34 | | * the terms of any one of the MPL, the GPL or the LGPL. |
35 | | * |
36 | | * ***** END LICENSE BLOCK ***** */ |
37 | | /* |
38 | | * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada |
39 | | * And Contributors. All rights reserved. |
40 | | * |
41 | | * Redistribution and use in source and binary forms, with or without |
42 | | * modification, are permitted provided that the following conditions |
43 | | * are met: |
44 | | * |
45 | | * 1. Redistributions of source code must retain the above copyright |
46 | | * notice, this list of conditions and the following disclaimer. |
47 | | * |
48 | | * 2. Redistributions in binary form must reproduce the above copyright |
49 | | * notice, this list of conditions and the following disclaimer in the |
50 | | * documentation and/or other materials provided with the distribution. |
51 | | * |
52 | | * 3. All modifications to the source code must be clearly marked as |
53 | | * such. Binary redistributions based on modified source code |
54 | | * must be clearly marked as modified versions in the documentation |
55 | | * and/or other materials provided with the distribution. |
56 | | * |
57 | | * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS |
58 | | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
59 | | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
60 | | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL |
61 | | * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
62 | | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
63 | | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
64 | | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
65 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
66 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
67 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
68 | | * SUCH DAMAGE. |
69 | | */ |
70 | | |
71 | | #include <cstdlib> |
72 | | #include <cstring> |
73 | | #include <cstdio> |
74 | | #include <cctype> |
75 | | #include <limits> |
76 | | #include <sstream> |
77 | | #if __cplusplus >= 202002L || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) |
78 | | #include <bit> |
79 | | #endif |
80 | | |
81 | | #include "hashmgr.hxx" |
82 | | #include "csutil.hxx" |
83 | | #include "atypes.hxx" |
84 | | #include "langnum.hxx" |
85 | | |
86 | | // build a hash table from a munched word list |
87 | | |
88 | | HashMgr::HashMgr(const char* tpath, const char* apath, const char* key) |
89 | 15.2k | : flag_mode(FLAG_CHAR), |
90 | 15.2k | complexprefixes(0), |
91 | 15.2k | utf8(0), |
92 | 15.2k | forbiddenword(FORBIDDENWORD), // forbidden word signing flag |
93 | 15.2k | langnum(0), |
94 | 15.2k | csconv(NULL) |
95 | 15.2k | { |
96 | 15.2k | load_config(apath, key); |
97 | 15.2k | if (!csconv) |
98 | 14.5k | csconv = get_current_cs(SPELL_ENCODING); |
99 | 15.2k | int ec = load_tables(tpath, key); |
100 | 15.2k | if (ec) { |
101 | | /* error condition - what should we do here */ |
102 | 7.85k | HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n", ec); |
103 | 7.85k | free_table(); |
104 | | //keep table size to 1 to fix possible division with zero |
105 | 7.85k | tableptr.resize(1, nullptr); |
106 | 7.85k | } |
107 | 15.2k | } |
108 | | |
109 | 489k | void HashMgr::free_flag(unsigned short* astr, int alen) { |
110 | 489k | if (astr && (aliasf.empty() || TESTAFF(astr, ONLYUPCASEFLAG, alen))) |
111 | 188k | delete[] astr; |
112 | 489k | } |
113 | | |
114 | 23.0k | void HashMgr::free_table() { |
115 | | // now pass through hash table freeing up everything |
116 | | // go through column by column of the table |
117 | 9.83M | for (auto ptr : tableptr) { |
118 | 9.83M | hentry* nt = NULL; |
119 | 10.3M | while (ptr) { |
120 | 489k | nt = ptr->next; |
121 | 489k | free_flag(ptr->astr, ptr->alen); |
122 | 489k | free(ptr); |
123 | 489k | ptr = nt; |
124 | 489k | } |
125 | 9.83M | } |
126 | 23.0k | tableptr.clear(); |
127 | 23.0k | } |
128 | | |
129 | 15.2k | HashMgr::~HashMgr() { |
130 | 15.2k | free_table(); |
131 | | |
132 | 15.2k | for (auto& j : aliasf) |
133 | 507 | delete[] j; |
134 | 15.2k | aliasf.clear(); |
135 | | |
136 | 15.2k | for (auto& j : aliasm) |
137 | 731 | delete[] j; |
138 | 15.2k | aliasm.clear(); |
139 | | |
140 | | #ifdef MOZILLA_CLIENT |
141 | | delete[] csconv; |
142 | | #endif |
143 | 15.2k | } |
144 | | |
145 | | // lookup a root word in the hashtable |
146 | | |
147 | 2.82G | struct hentry* HashMgr::lookup(const char* word, size_t len) const { |
148 | 2.82G | struct hentry* dp = tableptr[hash(word, len)]; |
149 | 2.82G | if (!dp) |
150 | 2.43G | return NULL; |
151 | 824M | for (; dp != NULL; dp = dp->next) { |
152 | 593M | if (strcmp(word, dp->word) == 0) |
153 | 161M | return dp; |
154 | 593M | } |
155 | 231M | return NULL; |
156 | 393M | } |
157 | | |
158 | | // add a word to the hash table (private) |
159 | | int HashMgr::add_word(const std::string& in_word, |
160 | | int wcl, |
161 | | unsigned short* aff, |
162 | | int al, |
163 | | const std::string* in_desc, |
164 | | bool onlyupcase, |
165 | 544k | int captype) { |
166 | | |
167 | 544k | if (al > std::numeric_limits<short>::max()) { |
168 | 7 | HUNSPELL_WARNING(stderr, "error: affix len %d is over max limit\n", al); |
169 | 7 | free_flag(aff, al); |
170 | 7 | return 1; |
171 | 7 | } |
172 | | |
173 | 544k | const std::string* word = &in_word; |
174 | 544k | const std::string* desc = in_desc; |
175 | | |
176 | 544k | std::string *word_copy = NULL; |
177 | 544k | std::string *desc_copy = NULL; |
178 | 544k | if ((!ignorechars.empty() && !has_no_ignored_chars(in_word, ignorechars)) || complexprefixes) { |
179 | 298k | word_copy = new std::string(in_word); |
180 | | |
181 | 298k | if (!ignorechars.empty()) { |
182 | 105k | if (utf8) { |
183 | 30.7k | wcl = remove_ignored_chars_utf(*word_copy, ignorechars_utf16); |
184 | 74.6k | } else { |
185 | 74.6k | remove_ignored_chars(*word_copy, ignorechars); |
186 | 74.6k | } |
187 | 105k | } |
188 | | |
189 | 298k | if (complexprefixes) { |
190 | 253k | if (utf8) |
191 | 32.6k | wcl = reverseword_utf(*word_copy); |
192 | 221k | else |
193 | 221k | reverseword(*word_copy); |
194 | | |
195 | 253k | if (in_desc && aliasm.empty()) { |
196 | 33.1k | desc_copy = new std::string(*in_desc); |
197 | | |
198 | 33.1k | if (complexprefixes) { |
199 | 33.1k | if (utf8) |
200 | 5.94k | reverseword_utf(*desc_copy); |
201 | 27.1k | else |
202 | 27.1k | reverseword(*desc_copy); |
203 | 33.1k | } |
204 | 33.1k | desc = desc_copy; |
205 | 33.1k | } |
206 | 253k | } |
207 | | |
208 | 298k | word = word_copy; |
209 | 298k | } |
210 | | |
211 | | // limit of hp->blen |
212 | 544k | if (word->size() > std::numeric_limits<unsigned short>::max()) { |
213 | 5 | HUNSPELL_WARNING(stderr, "error: word len %ld is over max limit\n", word->size()); |
214 | 5 | delete desc_copy; |
215 | 5 | delete word_copy; |
216 | 5 | free_flag(aff, al); |
217 | 5 | return 1; |
218 | 5 | } |
219 | | |
220 | 544k | bool upcasehomonym = false; |
221 | 544k | int descl = desc ? (!aliasm.empty() ? sizeof(char*) : desc->size() + 1) : 0; |
222 | | // variable-length hash record with word and optional fields |
223 | 544k | auto hp = |
224 | 544k | (struct hentry*)malloc(sizeof(struct hentry) + word->size() + descl); |
225 | 544k | if (!hp) { |
226 | 0 | delete desc_copy; |
227 | 0 | delete word_copy; |
228 | 0 | free_flag(aff, al); |
229 | 0 | return 1; |
230 | 0 | } |
231 | | |
232 | 544k | char* hpw = hp->word; |
233 | 544k | memcpy(hpw, word->data(), word->size()); |
234 | 544k | hpw[word->size()] = 0; |
235 | | |
236 | 544k | int i = hash(hpw, word->size()); |
237 | | |
238 | 544k | hp->blen = (unsigned short)word->size(); |
239 | 544k | hp->clen = (unsigned short)wcl; |
240 | 544k | hp->alen = (short)al; |
241 | 544k | hp->astr = aff; |
242 | 544k | hp->next = NULL; |
243 | 544k | hp->next_homonym = NULL; |
244 | 544k | hp->var = (captype == INITCAP) ? H_OPT_INITCAP : 0; |
245 | | |
246 | | // store the description string or its pointer |
247 | 544k | if (desc) { |
248 | 101k | hp->var |= H_OPT; |
249 | 101k | if (!aliasm.empty()) { |
250 | 17.4k | hp->var |= H_OPT_ALIASM; |
251 | 17.4k | store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str()))); |
252 | 83.7k | } else { |
253 | 83.7k | strcpy(hpw + word->size() + 1, desc->c_str()); |
254 | 83.7k | } |
255 | 101k | if (HENTRY_FIND(hp, MORPH_PHON)) { |
256 | 28.0k | hp->var |= H_OPT_PHON; |
257 | | // store ph: fields (pronounciation, misspellings, old orthography etc.) |
258 | | // of a morphological description in reptable to use in REP replacements. |
259 | 28.0k | size_t predicted = tableptr.size() / MORPH_PHON_RATIO; |
260 | 28.0k | if (reptable.capacity() < predicted) |
261 | 1.25k | reptable.reserve(predicted); |
262 | 28.0k | std::string fields = HENTRY_DATA(hp); |
263 | 28.0k | std::string::const_iterator iter = fields.begin(), start_piece = mystrsep(fields, iter); |
264 | 93.4k | while (start_piece != fields.end()) { |
265 | 65.3k | if (std::string(start_piece, iter).find(MORPH_PHON) == 0) { |
266 | 27.9k | std::string ph = std::string(start_piece, iter).substr(sizeof MORPH_PHON - 1); |
267 | 27.9k | if (!ph.empty()) { |
268 | 27.2k | std::vector<w_char> w; |
269 | 27.2k | size_t strippatt; |
270 | 27.2k | std::string wordpart; |
271 | | // dictionary based REP replacement, separated by "->" |
272 | | // for example "pretty ph:prity ph:priti->pretti" to handle |
273 | | // both prity -> pretty and pritier -> prettiest suggestions. |
274 | 27.2k | if (((strippatt = ph.find("->")) != std::string::npos) && |
275 | 5.77k | (strippatt > 0) && (strippatt < ph.size() - 2)) { |
276 | 5.68k | wordpart = ph.substr(strippatt + 2); |
277 | 5.68k | ph.erase(ph.begin() + strippatt, ph.end()); |
278 | 5.68k | } else |
279 | 21.5k | wordpart = in_word; |
280 | | // when the ph: field ends with the character *, |
281 | | // strip last character of the pattern and the replacement |
282 | | // to match in REP suggestions also at character changes, |
283 | | // for example, "pretty ph:prity*" results "prit->prett" |
284 | | // REP replacement instead of "prity->pretty", to get |
285 | | // prity->pretty and pritiest->prettiest suggestions. |
286 | 27.2k | if (ph.at(ph.size()-1) == '*') { |
287 | 7.34k | strippatt = 1; |
288 | 7.34k | size_t stripword = 0; |
289 | 7.34k | if (utf8) { |
290 | 19.3k | while ((strippatt < ph.size()) && |
291 | 18.9k | ((ph.at(ph.size()-strippatt-1) & 0xc0) == 0x80)) |
292 | 13.1k | ++strippatt; |
293 | 7.79k | while ((stripword < wordpart.size()) && |
294 | 7.33k | ((wordpart.at(wordpart.size()-stripword-1) & 0xc0) == 0x80)) |
295 | 1.57k | ++stripword; |
296 | 6.21k | } |
297 | 7.34k | ++strippatt; |
298 | 7.34k | ++stripword; |
299 | 7.34k | if ((ph.size() > strippatt) && (wordpart.size() > stripword)) { |
300 | 6.23k | ph.erase(ph.size()-strippatt, strippatt); |
301 | 6.23k | wordpart.erase(wordpart.size()-stripword, stripword); |
302 | 6.23k | } |
303 | 7.34k | } |
304 | | // capitalize lowercase pattern for capitalized words to support |
305 | | // good suggestions also for capitalized misspellings, eg. |
306 | | // Wednesday ph:wendsay |
307 | | // results wendsay -> Wednesday and Wendsay -> Wednesday, too. |
308 | 27.2k | if (captype == INITCAP) { |
309 | 8.61k | std::string ph_capitalized; |
310 | 8.61k | if (utf8) { |
311 | 5.89k | u8_u16(w, ph); |
312 | 5.89k | if (get_captype_utf8(w, langnum) == NOCAP) { |
313 | 5.24k | mkinitcap_utf(w, langnum); |
314 | 5.24k | u16_u8(ph_capitalized, w); |
315 | 5.24k | } |
316 | 5.89k | } else if (get_captype(ph, csconv) == NOCAP) |
317 | 2.05k | mkinitcap(ph_capitalized, csconv); |
318 | | |
319 | 8.61k | if (!ph_capitalized.empty()) { |
320 | | // add also lowercase word in the case of German or |
321 | | // Hungarian to support lowercase suggestions lowercased by |
322 | | // compound word generation or derivational suffixes |
323 | | // (for example by adjectival suffix "-i" of geographical |
324 | | // names in Hungarian: |
325 | | // Massachusetts ph:messzecsuzec |
326 | | // messzecsuzeci -> massachusettsi (adjective) |
327 | | // For lowercasing by conditional PFX rules, see |
328 | | // tests/germancompounding test example or the |
329 | | // Hungarian dictionary.) |
330 | 5.24k | if (langnum == LANG_de || langnum == LANG_hu) { |
331 | 3.21k | std::string wordpart_lower(wordpart); |
332 | 3.21k | if (utf8) { |
333 | 3.21k | u8_u16(w, wordpart_lower); |
334 | 3.21k | mkallsmall_utf(w, langnum); |
335 | 3.21k | u16_u8(wordpart_lower, w); |
336 | 3.21k | } else { |
337 | 0 | mkallsmall(wordpart_lower, csconv); |
338 | 0 | } |
339 | 3.21k | reptable.emplace_back(); |
340 | 3.21k | reptable.back().pattern.assign(ph); |
341 | 3.21k | reptable.back().outstrings[0].assign(wordpart_lower); |
342 | 3.21k | } |
343 | 5.24k | reptable.emplace_back(); |
344 | 5.24k | reptable.back().pattern.assign(ph_capitalized); |
345 | 5.24k | reptable.back().outstrings[0].assign(wordpart); |
346 | 5.24k | } |
347 | 8.61k | } |
348 | 27.2k | reptable.emplace_back(); |
349 | 27.2k | reptable.back().pattern.assign(ph); |
350 | 27.2k | reptable.back().outstrings[0].assign(wordpart); |
351 | 27.2k | } |
352 | 27.9k | } |
353 | 65.3k | start_piece = mystrsep(fields, iter); |
354 | 65.3k | } |
355 | 28.0k | } |
356 | 101k | } |
357 | | |
358 | 544k | struct hentry* dp = tableptr[i]; |
359 | 544k | if (!dp) { |
360 | 175k | tableptr[i] = hp; |
361 | 175k | delete desc_copy; |
362 | 175k | delete word_copy; |
363 | 175k | return 0; |
364 | 175k | } |
365 | 22.3M | while (dp->next != NULL) { |
366 | 21.9M | if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) { |
367 | | // remove hidden onlyupcase homonym |
368 | 22.2k | if (!onlyupcase) { |
369 | 13.0k | if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { |
370 | 221 | delete[] dp->astr; |
371 | 221 | dp->astr = hp->astr; |
372 | 221 | dp->alen = hp->alen; |
373 | 221 | free(hp); |
374 | 221 | delete desc_copy; |
375 | 221 | delete word_copy; |
376 | 221 | return 0; |
377 | 12.8k | } else { |
378 | 12.8k | dp->next_homonym = hp; |
379 | 12.8k | } |
380 | 13.0k | } else { |
381 | 9.21k | upcasehomonym = true; |
382 | 9.21k | } |
383 | 22.2k | } |
384 | 21.9M | dp = dp->next; |
385 | 21.9M | } |
386 | 369k | if (strcmp(hp->word, dp->word) == 0) { |
387 | | // remove hidden onlyupcase homonym |
388 | 237k | if (!onlyupcase) { |
389 | 192k | if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { |
390 | 738 | delete[] dp->astr; |
391 | 738 | dp->astr = hp->astr; |
392 | 738 | dp->alen = hp->alen; |
393 | 738 | free(hp); |
394 | 738 | delete desc_copy; |
395 | 738 | delete word_copy; |
396 | 738 | return 0; |
397 | 191k | } else { |
398 | 191k | dp->next_homonym = hp; |
399 | 191k | } |
400 | 192k | } else { |
401 | 45.0k | upcasehomonym = true; |
402 | 45.0k | } |
403 | 237k | } |
404 | 368k | if (!upcasehomonym) { |
405 | 314k | dp->next = hp; |
406 | 314k | } else { |
407 | | // remove hidden onlyupcase homonym |
408 | 54.2k | delete[] hp->astr; |
409 | 54.2k | free(hp); |
410 | 54.2k | } |
411 | | |
412 | 368k | delete desc_copy; |
413 | 368k | delete word_copy; |
414 | 368k | return 0; |
415 | 369k | } |
416 | | |
417 | | int HashMgr::add_hidden_capitalized_word(const std::string& word, |
418 | | int wcl, |
419 | | unsigned short* flags, |
420 | | int flagslen, |
421 | | const std::string* dp, |
422 | 386k | int captype) { |
423 | 386k | if (flags == NULL) |
424 | 300k | flagslen = 0; |
425 | | |
426 | | // add inner capitalized forms to handle the following allcap forms: |
427 | | // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG |
428 | | // Allcaps with suffixes: CIA's -> CIA'S |
429 | 386k | if (((captype == HUHCAP) || (captype == HUHINITCAP) || |
430 | 242k | ((captype == ALLCAP) && (flagslen != 0))) && |
431 | 159k | !((flagslen != 0) && TESTAFF(flags, forbiddenword, flagslen))) { |
432 | 158k | unsigned short* flags2 = new unsigned short[flagslen + 1]; |
433 | 158k | flags2[flagslen] = ONLYUPCASEFLAG; |
434 | 158k | if (flagslen) { |
435 | 60.2k | memcpy(flags2, flags, flagslen * sizeof(unsigned short)); |
436 | 60.2k | std::sort(flags2, flags2 + flagslen + 1); |
437 | 60.2k | } |
438 | 158k | if (utf8) { |
439 | 28.3k | std::string st; |
440 | 28.3k | std::vector<w_char> w; |
441 | 28.3k | u8_u16(w, word); |
442 | 28.3k | mkallsmall_utf(w, langnum); |
443 | 28.3k | mkinitcap_utf(w, langnum); |
444 | 28.3k | u16_u8(st, w); |
445 | 28.3k | return add_word(st, wcl, flags2, flagslen + 1, dp, true, INITCAP); |
446 | 129k | } else { |
447 | 129k | std::string new_word(word); |
448 | 129k | mkallsmall(new_word, csconv); |
449 | 129k | mkinitcap(new_word, csconv); |
450 | 129k | int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true, INITCAP); |
451 | 129k | return ret; |
452 | 129k | } |
453 | 158k | } |
454 | 228k | return 0; |
455 | 386k | } |
456 | | |
457 | | // detect captype and modify word length for UTF-8 encoding |
458 | 386k | int HashMgr::get_clen_and_captype(const std::string& word, int* captype, std::vector<w_char> &workbuf) { |
459 | 386k | int len; |
460 | 386k | if (utf8) { |
461 | 112k | len = u8_u16(workbuf, word); |
462 | 112k | *captype = get_captype_utf8(workbuf, langnum); |
463 | 273k | } else { |
464 | 273k | len = word.size(); |
465 | 273k | *captype = get_captype(word, csconv); |
466 | 273k | } |
467 | 386k | return len; |
468 | 386k | } |
469 | | |
470 | 0 | int HashMgr::get_clen_and_captype(const std::string& word, int* captype) { |
471 | 0 | std::vector<w_char> workbuf; |
472 | 0 | return get_clen_and_captype(word, captype, workbuf); |
473 | 0 | } |
474 | | |
475 | | // remove word (personal dictionary function for standalone applications) |
476 | 0 | int HashMgr::remove(const std::string& word) { |
477 | 0 | struct hentry* dp = lookup(word.c_str(), word.size()); |
478 | 0 | while (dp) { |
479 | 0 | if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) { |
480 | 0 | auto flags = new unsigned short[dp->alen + 1]; |
481 | 0 | for (int i = 0; i < dp->alen; i++) |
482 | 0 | flags[i] = dp->astr[i]; |
483 | 0 | flags[dp->alen] = forbiddenword; |
484 | 0 | delete[] dp->astr; |
485 | 0 | dp->astr = flags; |
486 | 0 | dp->alen++; |
487 | 0 | std::sort(flags, flags + dp->alen); |
488 | 0 | } |
489 | 0 | dp = dp->next_homonym; |
490 | 0 | } |
491 | 0 | return 0; |
492 | 0 | } |
493 | | |
494 | | /* remove forbidden flag to add a personal word to the hash */ |
495 | 0 | void HashMgr::remove_forbidden_flag(const std::string& word) { |
496 | 0 | struct hentry* dp = lookup(word.c_str(), word.size()); |
497 | 0 | if (!dp) |
498 | 0 | return; |
499 | 0 | while (dp) { |
500 | 0 | if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) |
501 | 0 | dp->alen = 0; // XXX forbidden words of personal dic. |
502 | 0 | dp = dp->next_homonym; |
503 | 0 | } |
504 | 0 | } |
505 | | |
506 | | // add a custom dic. word to the hash table (public) |
507 | 0 | int HashMgr::add(const std::string& word) { |
508 | 0 | remove_forbidden_flag(word); |
509 | 0 | int captype, al = 0; |
510 | 0 | unsigned short* flags = NULL; |
511 | 0 | int wcl = get_clen_and_captype(word, &captype); |
512 | 0 | add_word(word, wcl, flags, al, NULL, false, captype); |
513 | 0 | return add_hidden_capitalized_word(word, wcl, flags, al, NULL, |
514 | 0 | captype); |
515 | 0 | } |
516 | | |
517 | 0 | int HashMgr::add_with_flags(const std::string& word, const std::string& flags, const std::string& desc) { |
518 | 0 | remove_forbidden_flag(word); |
519 | 0 | int captype; |
520 | 0 | unsigned short *df; |
521 | 0 | int al = decode_flags(&df, flags, NULL); |
522 | 0 | int wcl = get_clen_and_captype(word, &captype); |
523 | 0 | add_word(word, wcl, df, al, &desc, false, captype); |
524 | 0 | return add_hidden_capitalized_word(word, wcl, df, al, &desc, captype); |
525 | 0 | } |
526 | | |
527 | 0 | int HashMgr::add_with_affix(const std::string& word, const std::string& example) { |
528 | | // detect captype and modify word length for UTF-8 encoding |
529 | 0 | struct hentry* dp = lookup(example.c_str(), example.size()); |
530 | 0 | remove_forbidden_flag(word); |
531 | 0 | if (dp && dp->astr) { |
532 | 0 | int captype; |
533 | 0 | int wcl = get_clen_and_captype(word, &captype); |
534 | 0 | if (!aliasf.empty()) { |
535 | 0 | add_word(word, wcl, dp->astr, dp->alen, NULL, false, captype); |
536 | 0 | } else { |
537 | 0 | auto flags = new unsigned short[dp->alen]; |
538 | 0 | memcpy(flags, dp->astr, dp->alen * sizeof(unsigned short)); |
539 | 0 | add_word(word, wcl, flags, dp->alen, NULL, false, captype); |
540 | 0 | } |
541 | 0 | return add_hidden_capitalized_word(word, wcl, dp->astr, |
542 | 0 | dp->alen, NULL, captype); |
543 | 0 | } |
544 | 0 | return 1; |
545 | 0 | } |
546 | | |
547 | | // walk the hash table entry by entry - null at end |
548 | | // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp); |
549 | 3.84M | struct hentry* HashMgr::walk_hashtable(int& col, struct hentry* hp) const { |
550 | 3.84M | if (hp && hp->next != NULL) |
551 | 1.89M | return hp->next; |
552 | 77.7M | for (col++; col < (int)tableptr.size(); ++col) { |
553 | 77.7M | if (tableptr[col]) |
554 | 1.86M | return tableptr[col]; |
555 | 77.7M | } |
556 | | // null at end and reset to start |
557 | 91.5k | col = -1; |
558 | 91.5k | return NULL; |
559 | 1.95M | } |
560 | | |
561 | | // load a munched word list and build a hash table on the fly |
562 | 15.2k | int HashMgr::load_tables(const char* tpath, const char* key) { |
563 | | // open dictionary file |
564 | 15.2k | FileMgr* dict = new FileMgr(tpath, key); |
565 | 15.2k | if (dict == NULL) |
566 | 0 | return 1; |
567 | | |
568 | | // first read the first line of file to get hash table size |
569 | 15.2k | std::string ts; |
570 | 15.2k | if (!dict->getline(ts)) { |
571 | 343 | HUNSPELL_WARNING(stderr, "error: empty dic file %s\n", tpath); |
572 | 343 | delete dict; |
573 | 343 | return 2; |
574 | 343 | } |
575 | 14.8k | mychomp(ts); |
576 | | |
577 | | /* remove byte order mark */ |
578 | 14.8k | if (ts.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) { |
579 | 1 | ts.erase(0, 3); |
580 | 1 | } |
581 | | |
582 | 14.8k | int tablesize = atoi(ts.c_str()); |
583 | | |
584 | 14.8k | const int nExtra = 5 + USERWORD; |
585 | | #if !defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) |
586 | | const int max_allowed = (std::numeric_limits<int>::max() - 1 - nExtra) / int(sizeof(struct hentry*)); |
587 | | #else |
588 | 14.8k | const int max_allowed = (100000 - 1 - nExtra) / int(sizeof(struct hentry*)); |
589 | 14.8k | #endif |
590 | | |
591 | 14.8k | if (tablesize <= 0 || tablesize >= max_allowed) { |
592 | 7.45k | HUNSPELL_WARNING( |
593 | 7.45k | stderr, "error: line 1: missing or bad word count in the dic file\n"); |
594 | 7.45k | delete dict; |
595 | 7.45k | return 4; |
596 | 7.45k | } |
597 | 7.43k | tablesize += nExtra; |
598 | 7.43k | if ((tablesize & 1) == 0) |
599 | 4.94k | tablesize++; |
600 | | |
601 | | // allocate the hash table |
602 | 7.43k | tableptr.resize(tablesize, nullptr); |
603 | | |
604 | | // loop through all words on much list and add to hash |
605 | | // table and create word and affix strings |
606 | | |
607 | 7.43k | std::vector<w_char> workbuf; |
608 | | |
609 | 7.43k | int nLineCount(0); |
610 | 394k | while (dict->getline(ts)) { |
611 | 386k | ++nLineCount; |
612 | 386k | mychomp(ts); |
613 | | // split each line into word and morphological description |
614 | 386k | size_t dp_pos = 0; |
615 | 416k | while ((dp_pos = ts.find(':', dp_pos)) != std::string::npos) { |
616 | 45.6k | if ((dp_pos > 3) && (ts[dp_pos - 3] == ' ' || ts[dp_pos - 3] == '\t')) { |
617 | 17.0k | for (dp_pos -= 3; dp_pos > 0 && (ts[dp_pos-1] == ' ' || ts[dp_pos-1] == '\t'); --dp_pos) |
618 | 1.83k | ; |
619 | 15.2k | if (dp_pos == 0) { // missing word |
620 | 354 | dp_pos = std::string::npos; |
621 | 14.8k | } else { |
622 | 14.8k | ++dp_pos; |
623 | 14.8k | } |
624 | 15.2k | break; |
625 | 15.2k | } |
626 | 30.4k | ++dp_pos; |
627 | 30.4k | } |
628 | | |
629 | | // tabulator is the old morphological field separator |
630 | 386k | size_t dp2_pos = ts.find('\t'); |
631 | 386k | if (dp2_pos != std::string::npos && (dp_pos == std::string::npos || dp2_pos < dp_pos)) { |
632 | 71.7k | dp_pos = dp2_pos + 1; |
633 | 71.7k | } |
634 | | |
635 | 386k | std::string dp; |
636 | 386k | if (dp_pos != std::string::npos) { |
637 | 76.0k | dp.assign(ts.substr(dp_pos)); |
638 | 76.0k | ts.resize(dp_pos - 1); |
639 | 76.0k | } |
640 | | |
641 | | // split each line into word and affix char strings |
642 | | // "\/" signs slash in words (not affix separator) |
643 | | // "/" at beginning of the line is word character (not affix separator) |
644 | 386k | size_t ap_pos = ts.find('/'); |
645 | 391k | while (ap_pos != std::string::npos) { |
646 | 101k | if (ap_pos == 0) { |
647 | 3.97k | ++ap_pos; |
648 | 3.97k | continue; |
649 | 97.5k | } else if (ts[ap_pos - 1] != '\\') |
650 | 96.8k | break; |
651 | | // replace "\/" with "/" |
652 | 649 | ts.erase(ap_pos - 1, 1); |
653 | 649 | ap_pos = ts.find('/', ap_pos); |
654 | 649 | } |
655 | | |
656 | 386k | unsigned short* flags; |
657 | 386k | int al; |
658 | 386k | if (ap_pos != std::string::npos && ap_pos != ts.size()) { |
659 | 96.2k | std::string ap(ts.substr(ap_pos + 1)); |
660 | 96.2k | ts.resize(ap_pos); |
661 | 96.2k | if (!aliasf.empty()) { |
662 | 4.04k | int index = atoi(ap.c_str()); |
663 | 4.04k | al = get_aliasf(index, &flags, dict); |
664 | 4.04k | if (!al) { |
665 | 3.00k | HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", |
666 | 3.00k | dict->getlinenum()); |
667 | 3.00k | } |
668 | 92.2k | } else { |
669 | 92.2k | al = decode_flags(&flags, ap, dict); |
670 | 92.2k | if (al == -1) { |
671 | 0 | HUNSPELL_WARNING(stderr, "Can't allocate memory.\n"); |
672 | 0 | delete dict; |
673 | 0 | return 6; |
674 | 0 | } |
675 | 92.2k | std::sort(flags, flags + al); |
676 | 92.2k | } |
677 | 290k | } else { |
678 | 290k | al = 0; |
679 | 290k | flags = NULL; |
680 | 290k | } |
681 | | |
682 | 386k | int captype; |
683 | 386k | int wcl = get_clen_and_captype(ts, &captype, workbuf); |
684 | 386k | const std::string *dp_str = dp.empty() ? NULL : &dp; |
685 | | // add the word and its index plus its capitalized form optionally |
686 | 386k | if (add_word(ts, wcl, flags, al, dp_str, false, captype) || |
687 | 386k | add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) { |
688 | 12 | delete dict; |
689 | 12 | return 5; |
690 | 12 | } |
691 | 386k | } |
692 | | |
693 | 7.42k | int ret(0); |
694 | | |
695 | | // reject ludicrous tablesizes |
696 | 7.42k | if (tablesize > 8192 + nExtra && tablesize > nLineCount * 10 + nExtra) { |
697 | 42 | HUNSPELL_WARNING(stderr, ".dic initial approximate word count line value of %d is too large for %d lines\n", tablesize, nLineCount); |
698 | 42 | ret = 3; |
699 | 42 | } |
700 | | |
701 | 7.42k | delete dict; |
702 | 7.42k | return ret; |
703 | 7.43k | } |
704 | | |
705 | | // the hash function is a simple load and rotate |
706 | | // algorithm borrowed |
707 | 2.82G | int HashMgr::hash(const char* word, size_t len) const { |
708 | 2.82G | unsigned long hv = 0; |
709 | 2.82G | size_t i = 0; |
710 | 13.1G | while (i < 4 && i < len) |
711 | 10.3G | hv = (hv << 8) | word[i++]; |
712 | 138G | while (i < len) { |
713 | 136G | ROTATE(hv, ROTATE_LEN); |
714 | 136G | hv ^= word[i++]; |
715 | 136G | } |
716 | 2.82G | return (unsigned long)hv % tableptr.size(); |
717 | 2.82G | } |
718 | | |
719 | 111k | int HashMgr::decode_flags(unsigned short** result, const std::string& flags, FileMgr* af) const { |
720 | 111k | int len; |
721 | 111k | if (flags.empty()) { |
722 | 11.8k | *result = NULL; |
723 | 11.8k | return 0; |
724 | 11.8k | } |
725 | 99.8k | switch (flag_mode) { |
726 | 10.2k | case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz) |
727 | 10.2k | len = flags.size(); |
728 | 10.2k | if ((len & 1) == 1 && af != NULL) |
729 | 5.67k | HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n", |
730 | 5.67k | af->getlinenum()); |
731 | 10.2k | len >>= 1; |
732 | 10.2k | *result = new unsigned short[len]; |
733 | 153k | for (int i = 0; i < len; i++) { |
734 | 143k | unsigned short flag = ((unsigned short)((unsigned char)flags[i << 1]) << 8) | |
735 | 143k | ((unsigned short)((unsigned char)flags[(i << 1) | 1])); |
736 | | |
737 | 143k | if (flag >= DEFAULTFLAGS && af != NULL) { |
738 | 9.40k | HUNSPELL_WARNING(stderr, |
739 | 9.40k | "error: line %d: flag id %d is too large (max: %d)\n", |
740 | 9.40k | af->getlinenum(), flag, DEFAULTFLAGS - 1); |
741 | 9.40k | flag = 0; |
742 | 9.40k | } |
743 | | |
744 | 143k | (*result)[i] = flag; |
745 | 143k | } |
746 | 10.2k | break; |
747 | 0 | } |
748 | 4.50k | case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 |
749 | | // 23 233) |
750 | 3.32M | len = int(1 + std::count_if(flags.begin(), flags.end(), [](char c) { return c == ','; })); |
751 | 4.50k | *result = new unsigned short[len]; |
752 | 4.50k | unsigned short* dest = *result; |
753 | 4.50k | const char* src = flags.c_str(); |
754 | 3.33M | for (size_t p = 0; p < flags.size(); ++p) { |
755 | 3.32M | if (flags[p] == ',') { |
756 | 27.5k | int i = atoi(src); |
757 | 27.5k | if ((i >= DEFAULTFLAGS || i < 0) && af != NULL) { |
758 | 1.53k | HUNSPELL_WARNING( |
759 | 1.53k | stderr, "error: line %d: flag id %d is too large (max: %d)\n", |
760 | 1.53k | af->getlinenum(), i, DEFAULTFLAGS - 1); |
761 | 1.53k | i = 0; |
762 | 1.53k | } |
763 | 27.5k | *dest = (unsigned short)i; |
764 | 27.5k | if (*dest == 0 && af != NULL) |
765 | 25.3k | HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", |
766 | 25.3k | af->getlinenum()); |
767 | 27.5k | src = flags.c_str() + p + 1; |
768 | 27.5k | dest++; |
769 | 27.5k | } |
770 | 3.32M | } |
771 | 4.50k | int i = atoi(src); |
772 | 4.50k | if (i >= DEFAULTFLAGS || i < 0) { |
773 | 432 | HUNSPELL_WARNING(stderr, |
774 | 432 | "error: line %d: flag id %d is too large (max: %d)\n", |
775 | 432 | af->getlinenum(), i, DEFAULTFLAGS - 1); |
776 | 432 | i = 0; |
777 | 432 | } |
778 | 4.50k | *dest = (unsigned short)i; |
779 | 4.50k | if (*dest == 0) |
780 | 3.24k | HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", |
781 | 3.24k | af->getlinenum()); |
782 | 4.50k | break; |
783 | 0 | } |
784 | 11.5k | case FLAG_UNI: { // UTF-8 characters |
785 | 11.5k | std::vector<w_char> w; |
786 | 11.5k | u8_u16(w, flags); |
787 | 11.5k | len = w.size(); |
788 | 11.5k | *result = new unsigned short[len]; |
789 | | #if defined(__i386__) || defined(_M_IX86) || defined(_M_X64) |
790 | | memcpy(*result, w.data(), len * sizeof(unsigned short)); |
791 | | #else |
792 | 11.5k | unsigned short* dest = *result; |
793 | 433k | for (const w_char wc : w) { |
794 | 433k | *dest = (unsigned short)wc; |
795 | 433k | dest++; |
796 | 433k | } |
797 | 11.5k | #endif |
798 | 11.5k | break; |
799 | 0 | } |
800 | 73.4k | default: { // Ispell's one-character flags (erfg -> e r f g) |
801 | 73.4k | len = flags.size(); |
802 | 73.4k | *result = new unsigned short[len]; |
803 | 73.4k | unsigned short* dest = *result; |
804 | 4.93M | for (const char flag : flags) { |
805 | 4.93M | *dest = (unsigned char)flag; |
806 | 4.93M | dest++; |
807 | 4.93M | } |
808 | 73.4k | } |
809 | 99.8k | } |
810 | 99.8k | return len; |
811 | 99.8k | } |
812 | | |
813 | 68.7k | bool HashMgr::decode_flags(std::vector<unsigned short>& result, const std::string& flags, FileMgr* af) const { |
814 | 68.7k | if (flags.empty()) { |
815 | 91 | return false; |
816 | 91 | } |
817 | 68.6k | switch (flag_mode) { |
818 | 24.1k | case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz) |
819 | 24.1k | size_t len = flags.size(); |
820 | 24.1k | if ((len & 1) == 1) |
821 | 23.4k | HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n", |
822 | 23.4k | af->getlinenum()); |
823 | 24.1k | len >>= 1; |
824 | 24.1k | result.reserve(result.size() + len); |
825 | 160k | for (size_t i = 0; i < len; ++i) { |
826 | 136k | result.push_back(((unsigned short)((unsigned char)flags[i << 1]) << 8) | |
827 | 136k | ((unsigned short)((unsigned char)flags[(i << 1) | 1]))); |
828 | 136k | } |
829 | 24.1k | break; |
830 | 0 | } |
831 | 5.69k | case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 |
832 | | // 23 233) |
833 | 5.69k | const char* src = flags.c_str(); |
834 | 48.0k | for (const char* p = src; *p; p++) { |
835 | 42.3k | if (*p == ',') { |
836 | 3.13k | int i = atoi(src); |
837 | 3.13k | if (i >= DEFAULTFLAGS) { |
838 | 589 | HUNSPELL_WARNING( |
839 | 589 | stderr, "error: line %d: flag id %d is too large (max: %d)\n", |
840 | 589 | af->getlinenum(), i, DEFAULTFLAGS - 1); |
841 | 589 | i = 0; |
842 | 589 | } |
843 | 3.13k | result.push_back((unsigned short)i); |
844 | 3.13k | if (result.back() == 0) |
845 | 1.30k | HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", |
846 | 1.30k | af->getlinenum()); |
847 | 3.13k | src = p + 1; |
848 | 3.13k | } |
849 | 42.3k | } |
850 | 5.69k | int i = atoi(src); |
851 | 5.69k | if (i >= DEFAULTFLAGS) { |
852 | 394 | HUNSPELL_WARNING(stderr, |
853 | 394 | "error: line %d: flag id %d is too large (max: %d)\n", |
854 | 394 | af->getlinenum(), i, DEFAULTFLAGS - 1); |
855 | 394 | i = 0; |
856 | 394 | } |
857 | 5.69k | result.push_back((unsigned short)i); |
858 | 5.69k | if (result.back() == 0) |
859 | 4.50k | HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", |
860 | 4.50k | af->getlinenum()); |
861 | 5.69k | break; |
862 | 0 | } |
863 | 1.80k | case FLAG_UNI: { // UTF-8 characters |
864 | 1.80k | std::vector<w_char> w; |
865 | 1.80k | u8_u16(w, flags); |
866 | 1.80k | size_t len = w.size(), origsize = result.size(); |
867 | | #if defined(__i386__) || defined(_M_IX86) || defined(_M_X64) |
868 | | result.resize(origsize + len); |
869 | | memcpy(result.data() + origsize, w.data(), len * sizeof(short)); |
870 | | #else |
871 | 1.80k | result.reserve(origsize + len); |
872 | 2.33k | for (const w_char wc : w) result.push_back((unsigned short)wc); |
873 | 1.80k | #endif |
874 | 1.80k | break; |
875 | 0 | } |
876 | 36.9k | default: { // Ispell's one-character flags (erfg -> e r f g) |
877 | 36.9k | result.reserve(flags.size()); |
878 | 654k | for (const char flag : flags) { |
879 | 654k | result.push_back((unsigned char)flag); |
880 | 654k | } |
881 | 36.9k | } |
882 | 68.6k | } |
883 | 68.6k | return true; |
884 | 68.6k | } |
885 | | |
886 | 95.5k | unsigned short HashMgr::decode_flag(const std::string& f) const { |
887 | 95.5k | unsigned short s = 0; |
888 | 95.5k | int i; |
889 | 95.5k | switch (flag_mode) { |
890 | 5.64k | case FLAG_LONG: |
891 | 5.64k | if (f.size() >= 2) |
892 | 2.56k | s = ((unsigned short)((unsigned char)f[0]) << 8) | ((unsigned short)((unsigned char)f[1])); |
893 | 5.64k | break; |
894 | 1.99k | case FLAG_NUM: |
895 | 1.99k | i = atoi(f.c_str()); |
896 | 1.99k | if (i >= DEFAULTFLAGS) { |
897 | 96 | HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", |
898 | 96 | i, DEFAULTFLAGS - 1); |
899 | 96 | i = 0; |
900 | 96 | } |
901 | 1.99k | s = (unsigned short)i; |
902 | 1.99k | break; |
903 | 1.99k | case FLAG_UNI: { |
904 | 1.99k | std::vector<w_char> w; |
905 | 1.99k | u8_u16(w, f); |
906 | 1.99k | if (!w.empty()) |
907 | 1.89k | s = (unsigned short)w[0]; |
908 | 1.99k | break; |
909 | 0 | } |
910 | 85.9k | default: |
911 | 85.9k | if (!f.empty()) |
912 | 83.2k | s = (unsigned char)f[0]; |
913 | 95.5k | } |
914 | 95.5k | if (s == 0) |
915 | 12.2k | HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); |
916 | 95.5k | return s; |
917 | 95.5k | } |
918 | | |
919 | 1.13k | std::string HashMgr::encode_flag(unsigned short f) const { |
920 | 1.13k | if (f == 0) |
921 | 245 | return "(NULL)"; |
922 | 889 | std::string ch; |
923 | 889 | if (flag_mode == FLAG_LONG) { |
924 | 32 | ch.push_back((unsigned char)(f >> 8)); |
925 | 32 | ch.push_back((unsigned char)(f - ((f >> 8) << 8))); |
926 | 857 | } else if (flag_mode == FLAG_NUM) { |
927 | 11 | ch = std::to_string(f); |
928 | 846 | } else if (flag_mode == FLAG_UNI) { |
929 | | |
930 | | #if defined(__i386__) || defined(_M_IX86) || defined(_M_X64) |
931 | | |
932 | | #if __cplusplus >= 202002L || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) |
933 | | auto wc = std::bit_cast<w_char>(f); |
934 | | #else |
935 | | w_char wc; |
936 | | memcpy(&wc, &f, sizeof(unsigned short)); |
937 | | #endif |
938 | | |
939 | | #else |
940 | 27 | w_char wc; |
941 | 27 | wc.h = (unsigned char)(f >> 8); |
942 | 27 | wc.l = (unsigned char)(f & 0xff); |
943 | 27 | #endif |
944 | 27 | const std::vector<w_char> w = { wc }; |
945 | 27 | u16_u8(ch, w); |
946 | 819 | } else { |
947 | 819 | ch.push_back((unsigned char)(f)); |
948 | 819 | } |
949 | 889 | return ch; |
950 | 1.13k | } |
951 | | |
952 | | // read in aff file and set flag mode |
953 | 15.2k | int HashMgr::load_config(const char* affpath, const char* key) { |
954 | 15.2k | int firstline = 1; |
955 | | |
956 | | // open the affix file |
957 | 15.2k | FileMgr* afflst = new FileMgr(affpath, key); |
958 | 15.2k | if (!afflst) { |
959 | 0 | HUNSPELL_WARNING( |
960 | 0 | stderr, "Error - could not open affix description file %s\n", affpath); |
961 | 0 | return 1; |
962 | 0 | } |
963 | | |
964 | | // read in each line ignoring any that do not |
965 | | // start with a known line type indicator |
966 | | |
967 | 15.2k | std::string line; |
968 | 639k | while (afflst->getline(line)) { |
969 | 626k | mychomp(line); |
970 | | |
971 | | /* remove byte order mark */ |
972 | 626k | if (firstline) { |
973 | 14.8k | firstline = 0; |
974 | 14.8k | if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) { |
975 | 1 | line.erase(0, 3); |
976 | 1 | } |
977 | 14.8k | } |
978 | | |
979 | | /* parse in the try string */ |
980 | 626k | if ((line.compare(0, 4, "FLAG", 4) == 0) && line.size() > 4 && isspace(line[4])) { |
981 | 5.46k | if (flag_mode != FLAG_CHAR) { |
982 | 2.51k | HUNSPELL_WARNING(stderr, |
983 | 2.51k | "error: line %d: multiple definitions of the FLAG " |
984 | 2.51k | "affix file parameter\n", |
985 | 2.51k | afflst->getlinenum()); |
986 | 2.51k | } |
987 | 5.46k | if (line.find("long") != std::string::npos) |
988 | 1.02k | flag_mode = FLAG_LONG; |
989 | 5.46k | if (line.find("num") != std::string::npos) |
990 | 1.32k | flag_mode = FLAG_NUM; |
991 | 5.46k | if (line.find("UTF-8") != std::string::npos) |
992 | 804 | flag_mode = FLAG_UNI; |
993 | 5.46k | if (flag_mode == FLAG_CHAR) { |
994 | 1.80k | HUNSPELL_WARNING( |
995 | 1.80k | stderr, |
996 | 1.80k | "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n", |
997 | 1.80k | afflst->getlinenum()); |
998 | 1.80k | } |
999 | 5.46k | } |
1000 | | |
1001 | 626k | if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) { |
1002 | 1.47k | std::string st; |
1003 | 1.47k | if (!parse_string(line, st, afflst->getlinenum())) { |
1004 | 5 | delete afflst; |
1005 | 5 | return 1; |
1006 | 5 | } |
1007 | 1.47k | forbiddenword = decode_flag(st); |
1008 | 1.47k | } |
1009 | | |
1010 | 626k | if (line.compare(0, 3, "SET", 3) == 0) { |
1011 | 4.23k | if (!parse_string(line, enc, afflst->getlinenum())) { |
1012 | 141 | delete afflst; |
1013 | 141 | return 1; |
1014 | 141 | } |
1015 | 4.09k | if (enc == "UTF-8") { |
1016 | 3.45k | utf8 = 1; |
1017 | 3.45k | } else |
1018 | 642 | csconv = get_current_cs(enc); |
1019 | 4.09k | } |
1020 | | |
1021 | 625k | if (line.compare(0, 4, "LANG", 4) == 0) { |
1022 | 1.38k | if (!parse_string(line, lang, afflst->getlinenum())) { |
1023 | 35 | delete afflst; |
1024 | 35 | return 1; |
1025 | 35 | } |
1026 | 1.34k | langnum = get_lang_num(lang); |
1027 | 1.34k | } |
1028 | | |
1029 | | /* parse in the ignored characters (for example, Arabic optional diacritics |
1030 | | * characters */ |
1031 | 625k | if (line.compare(0, 6, "IGNORE", 6) == 0) { |
1032 | 1.31k | if (!parse_array(line, ignorechars, ignorechars_utf16, |
1033 | 1.31k | utf8, afflst->getlinenum())) { |
1034 | 53 | delete afflst; |
1035 | 53 | return 1; |
1036 | 53 | } |
1037 | 1.31k | } |
1038 | | |
1039 | 625k | if ((line.compare(0, 2, "AF", 2) == 0) && line.size() > 2 && isspace(line[2])) { |
1040 | 837 | if (!parse_aliasf(line, afflst)) { |
1041 | 650 | delete afflst; |
1042 | 650 | return 1; |
1043 | 650 | } |
1044 | 837 | } |
1045 | | |
1046 | 625k | if ((line.compare(0, 2, "AM", 2) == 0) && line.size() > 2 && isspace(line[2])) { |
1047 | 687 | if (!parse_aliasm(line, afflst)) { |
1048 | 377 | delete afflst; |
1049 | 377 | return 1; |
1050 | 377 | } |
1051 | 687 | } |
1052 | | |
1053 | 624k | if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0) |
1054 | 4.60k | complexprefixes = 1; |
1055 | | |
1056 | | /* parse in the typical fault correcting table */ |
1057 | 624k | if (line.compare(0, 3, "REP", 3) == 0) { |
1058 | 1.10k | if (!parse_reptable(line, afflst)) { |
1059 | 1.08k | delete afflst; |
1060 | 1.08k | return 1; |
1061 | 1.08k | } |
1062 | 1.10k | } |
1063 | | |
1064 | | // don't check the full affix file, yet |
1065 | 623k | if (((line.compare(0, 3, "SFX", 3) == 0) || |
1066 | 602k | (line.compare(0, 3, "PFX", 3) == 0)) && |
1067 | 33.6k | line.size() > 3 && isspace(line[3]) && |
1068 | 15.4k | !reptable.empty()) // (REP table is in the end of Afrikaans aff file) |
1069 | 2 | break; |
1070 | 623k | } |
1071 | | |
1072 | 12.8k | delete afflst; |
1073 | 12.8k | return 0; |
1074 | 15.2k | } |
1075 | | |
1076 | | /* parse in the ALIAS table */ |
1077 | 837 | bool HashMgr::parse_aliasf(const std::string& line, FileMgr* af) { |
1078 | 837 | if (!aliasf.empty()) { |
1079 | 62 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", |
1080 | 62 | af->getlinenum()); |
1081 | 62 | return false; |
1082 | 62 | } |
1083 | 775 | int i = 0, np = 0, numaliasf = 0; |
1084 | 775 | auto iter = line.begin(), start_piece = mystrsep(line, iter); |
1085 | 2.85k | while (start_piece != line.end()) { |
1086 | 2.19k | switch (i) { |
1087 | 775 | case 0: { |
1088 | 775 | np++; |
1089 | 775 | break; |
1090 | 0 | } |
1091 | 766 | case 1: { |
1092 | 766 | numaliasf = atoi(std::string(start_piece, iter).c_str()); |
1093 | 766 | if (numaliasf < 1) { |
1094 | 116 | aliasf.clear(); |
1095 | 116 | aliasflen.clear(); |
1096 | 116 | HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", |
1097 | 116 | af->getlinenum()); |
1098 | 116 | return false; |
1099 | 116 | } |
1100 | 650 | aliasf.reserve(std::min(numaliasf, 16384)); |
1101 | 650 | aliasflen.reserve(std::min(numaliasf, 16384)); |
1102 | 650 | np++; |
1103 | 650 | break; |
1104 | 766 | } |
1105 | 651 | default: |
1106 | 651 | break; |
1107 | 2.19k | } |
1108 | 2.07k | ++i; |
1109 | 2.07k | start_piece = mystrsep(line, iter); |
1110 | 2.07k | } |
1111 | 659 | if (np != 2) { |
1112 | 9 | aliasf.clear(); |
1113 | 9 | aliasflen.clear(); |
1114 | 9 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", |
1115 | 9 | af->getlinenum()); |
1116 | 9 | return false; |
1117 | 9 | } |
1118 | | |
1119 | | /* now parse the numaliasf lines to read in the remainder of the table */ |
1120 | 5.55k | for (int j = 0; j < numaliasf; ++j) { |
1121 | 5.36k | std::string nl; |
1122 | 5.36k | unsigned short* alias = NULL; |
1123 | 5.36k | unsigned aliaslen = 0; |
1124 | 5.36k | i = 0; |
1125 | 5.36k | if (af->getline(nl)) { |
1126 | 5.16k | mychomp(nl); |
1127 | 5.16k | iter = nl.begin(); |
1128 | 5.16k | start_piece = mystrsep(nl, iter); |
1129 | 5.16k | bool errored = false; |
1130 | 22.3k | while (!errored && start_piece != nl.end()) { |
1131 | 17.2k | switch (i) { |
1132 | 5.16k | case 0: { |
1133 | 5.16k | if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) { |
1134 | 239 | errored = true; |
1135 | 239 | break; |
1136 | 239 | } |
1137 | 4.92k | break; |
1138 | 5.16k | } |
1139 | 4.92k | case 1: { |
1140 | 4.90k | std::string piece(start_piece, iter); |
1141 | 4.90k | aliaslen = |
1142 | 4.90k | (unsigned short)decode_flags(&alias, piece, af); |
1143 | 4.90k | std::sort(alias, alias + aliaslen); |
1144 | 4.90k | break; |
1145 | 5.16k | } |
1146 | 7.14k | default: |
1147 | 7.14k | break; |
1148 | 17.2k | } |
1149 | 17.2k | ++i; |
1150 | 17.2k | start_piece = mystrsep(nl, iter); |
1151 | 17.2k | } |
1152 | 5.16k | } |
1153 | 5.36k | if (!alias) { |
1154 | 4.86k | for (int k = 0; k < j; ++k) { |
1155 | 4.39k | delete[] aliasf[k]; |
1156 | 4.39k | } |
1157 | 463 | aliasf.clear(); |
1158 | 463 | aliasflen.clear(); |
1159 | 463 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
1160 | 463 | af->getlinenum()); |
1161 | 463 | return false; |
1162 | 463 | } |
1163 | | |
1164 | 4.90k | aliasf.push_back(alias); |
1165 | 4.90k | aliasflen.push_back(aliaslen); |
1166 | 4.90k | } |
1167 | 187 | return true; |
1168 | 650 | } |
1169 | | |
1170 | 43.1k | int HashMgr::is_aliasf() const { |
1171 | 43.1k | return !aliasf.empty(); |
1172 | 43.1k | } |
1173 | | |
1174 | 5.67k | int HashMgr::get_aliasf(int index, unsigned short** fvec, FileMgr* af) const { |
1175 | 5.67k | if (index > 0 && static_cast<size_t>(index) <= aliasflen.size()) { |
1176 | 1.79k | *fvec = aliasf[index - 1]; |
1177 | 1.79k | return aliasflen[index - 1]; |
1178 | 1.79k | } |
1179 | 3.88k | HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n", |
1180 | 3.88k | af->getlinenum(), index); |
1181 | 3.88k | *fvec = NULL; |
1182 | 3.88k | return 0; |
1183 | 5.67k | } |
1184 | | |
1185 | | /* parse morph alias definitions */ |
1186 | 687 | bool HashMgr::parse_aliasm(const std::string& line, FileMgr* af) { |
1187 | 687 | if (!aliasm.empty()) { |
1188 | 64 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", |
1189 | 64 | af->getlinenum()); |
1190 | 64 | return false; |
1191 | 64 | } |
1192 | 623 | int i = 0, np = 0, numaliasm = 0; |
1193 | 623 | auto iter = line.begin(), start_piece = mystrsep(line, iter); |
1194 | 2.62k | while (start_piece != line.end()) { |
1195 | 2.04k | switch (i) { |
1196 | 623 | case 0: { |
1197 | 623 | np++; |
1198 | 623 | break; |
1199 | 0 | } |
1200 | 604 | case 1: { |
1201 | 604 | numaliasm = atoi(std::string(start_piece, iter).c_str()); |
1202 | 604 | if (numaliasm < 1) { |
1203 | 42 | HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", |
1204 | 42 | af->getlinenum()); |
1205 | 42 | return false; |
1206 | 42 | } |
1207 | 562 | aliasm.reserve(std::min(numaliasm, 16384)); |
1208 | 562 | np++; |
1209 | 562 | break; |
1210 | 604 | } |
1211 | 816 | default: |
1212 | 816 | break; |
1213 | 2.04k | } |
1214 | 2.00k | ++i; |
1215 | 2.00k | start_piece = mystrsep(line, iter); |
1216 | 2.00k | } |
1217 | 581 | if (np != 2) { |
1218 | 19 | aliasm.clear(); |
1219 | 19 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", |
1220 | 19 | af->getlinenum()); |
1221 | 19 | return false; |
1222 | 19 | } |
1223 | | |
1224 | | /* now parse the numaliasm lines to read in the remainder of the table */ |
1225 | 2.46k | for (int j = 0; j < numaliasm; ++j) { |
1226 | 2.15k | std::string nl; |
1227 | 2.15k | char* alias = NULL; |
1228 | 2.15k | if (af->getline(nl)) { |
1229 | 2.07k | mychomp(nl); |
1230 | 2.07k | iter = nl.begin(); |
1231 | 2.07k | i = 0; |
1232 | 2.07k | start_piece = mystrsep(nl, iter); |
1233 | 2.07k | bool errored = false; |
1234 | 9.00k | while (!errored && start_piece != nl.end()) { |
1235 | 6.92k | switch (i) { |
1236 | 2.06k | case 0: { |
1237 | 2.06k | if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) { |
1238 | 144 | errored = true; |
1239 | 144 | break; |
1240 | 144 | } |
1241 | 1.92k | break; |
1242 | 2.06k | } |
1243 | 1.92k | case 1: { |
1244 | | // add the remaining of the line |
1245 | 1.90k | std::string::const_iterator end = nl.end(); |
1246 | 1.90k | std::string chunk(start_piece, end); |
1247 | 1.90k | if (complexprefixes) { |
1248 | 763 | if (utf8) |
1249 | 264 | reverseword_utf(chunk); |
1250 | 499 | else |
1251 | 499 | reverseword(chunk); |
1252 | 763 | } |
1253 | 1.90k | size_t sl = chunk.size() + 1; |
1254 | 1.90k | alias = new char[sl]; |
1255 | 1.90k | memcpy(alias, chunk.c_str(), sl); |
1256 | 1.90k | break; |
1257 | 2.06k | } |
1258 | 2.95k | default: |
1259 | 2.95k | break; |
1260 | 6.92k | } |
1261 | 6.92k | ++i; |
1262 | 6.92k | start_piece = mystrsep(nl, iter); |
1263 | 6.92k | } |
1264 | 2.07k | } |
1265 | 2.15k | if (!alias) { |
1266 | 1.42k | for (int k = 0; k < j; ++k) { |
1267 | 1.17k | delete[] aliasm[k]; |
1268 | 1.17k | } |
1269 | 252 | aliasm.clear(); |
1270 | 252 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
1271 | 252 | af->getlinenum()); |
1272 | 252 | return false; |
1273 | 252 | } |
1274 | 1.90k | aliasm.push_back(alias); |
1275 | 1.90k | } |
1276 | 310 | return true; |
1277 | 562 | } |
1278 | | |
1279 | 44.1k | int HashMgr::is_aliasm() const { |
1280 | 44.1k | return !aliasm.empty(); |
1281 | 44.1k | } |
1282 | | |
1283 | 21.1k | char* HashMgr::get_aliasm(int index) const { |
1284 | 21.1k | if (index > 0 && static_cast<size_t>(index) <= aliasm.size()) |
1285 | 8.27k | return aliasm[index - 1]; |
1286 | 12.8k | HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index); |
1287 | 12.8k | return NULL; |
1288 | 21.1k | } |
1289 | | |
1290 | | /* parse in the typical fault correcting table */ |
1291 | 1.10k | bool HashMgr::parse_reptable(const std::string& line, FileMgr* af) { |
1292 | 1.10k | if (!reptable.empty()) { |
1293 | 6 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", |
1294 | 6 | af->getlinenum()); |
1295 | 6 | return false; |
1296 | 6 | } |
1297 | 1.09k | int numrep = -1, i = 0, np = 0; |
1298 | 1.09k | auto iter = line.begin(), start_piece = mystrsep(line, iter); |
1299 | 5.96k | while (start_piece != line.end()) { |
1300 | 4.95k | switch (i) { |
1301 | 1.09k | case 0: { |
1302 | 1.09k | np++; |
1303 | 1.09k | break; |
1304 | 0 | } |
1305 | 930 | case 1: { |
1306 | 930 | numrep = atoi(std::string(start_piece, iter).c_str()); |
1307 | 930 | if (numrep < 1) { |
1308 | 91 | HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", |
1309 | 91 | af->getlinenum()); |
1310 | 91 | return false; |
1311 | 91 | } |
1312 | 839 | reptable.reserve(std::min(numrep, 16384)); |
1313 | 839 | np++; |
1314 | 839 | break; |
1315 | 930 | } |
1316 | 2.92k | default: |
1317 | 2.92k | break; |
1318 | 4.95k | } |
1319 | 4.86k | ++i; |
1320 | 4.86k | start_piece = mystrsep(line, iter); |
1321 | 4.86k | } |
1322 | 1.00k | if (np != 2) { |
1323 | 169 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", |
1324 | 169 | af->getlinenum()); |
1325 | 169 | return false; |
1326 | 169 | } |
1327 | | |
1328 | | /* now parse the numrep lines to read in the remainder of the table */ |
1329 | 4.36k | for (int j = 0; j < numrep; ++j) { |
1330 | 4.34k | std::string nl; |
1331 | 4.34k | reptable.emplace_back(); |
1332 | 4.34k | int type = 0; |
1333 | 4.34k | if (af->getline(nl)) { |
1334 | 4.16k | mychomp(nl); |
1335 | 4.16k | iter = nl.begin(); |
1336 | 4.16k | i = 0; |
1337 | 4.16k | start_piece = mystrsep(nl, iter); |
1338 | 4.16k | bool errored = false; |
1339 | 18.0k | while (!errored && start_piece != nl.end()) { |
1340 | 13.8k | switch (i) { |
1341 | 4.08k | case 0: { |
1342 | 4.08k | if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) { |
1343 | 481 | errored = true; |
1344 | 481 | break; |
1345 | 481 | } |
1346 | 3.59k | break; |
1347 | 4.08k | } |
1348 | 3.59k | case 1: { |
1349 | 3.57k | if (*start_piece == '^') |
1350 | 80 | type = 1; |
1351 | 3.57k | reptable.back().pattern.assign(start_piece + type, iter); |
1352 | 3.57k | mystrrep(reptable.back().pattern, "_", " "); |
1353 | 3.57k | if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') { |
1354 | 2.64k | type += 2; |
1355 | 2.64k | reptable.back().pattern.resize(reptable.back().pattern.size() - 1); |
1356 | 2.64k | } |
1357 | 3.57k | break; |
1358 | 4.08k | } |
1359 | 3.53k | case 2: { |
1360 | 3.53k | reptable.back().outstrings[type].assign(start_piece, iter); |
1361 | 3.53k | mystrrep(reptable.back().outstrings[type], "_", " "); |
1362 | 3.53k | break; |
1363 | 4.08k | } |
1364 | 2.66k | default: |
1365 | 2.66k | break; |
1366 | 13.8k | } |
1367 | 13.8k | ++i; |
1368 | 13.8k | start_piece = mystrsep(nl, iter); |
1369 | 13.8k | } |
1370 | 4.16k | } |
1371 | 4.34k | if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) { |
1372 | 815 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", |
1373 | 815 | af->getlinenum()); |
1374 | 815 | reptable.clear(); |
1375 | 815 | return false; |
1376 | 815 | } |
1377 | 4.34k | } |
1378 | 24 | return true; |
1379 | 839 | } |
1380 | | |
1381 | | // return replacing table |
1382 | 3.02M | const std::vector<replentry>& HashMgr::get_reptable() const { |
1383 | 3.02M | return reptable; |
1384 | 3.02M | } |