/src/aspell/modules/speller/default/language.hpp
Line | Count | Source |
1 | | // Copyright 2004 by Kevin Atkinson under the terms of the LGPL |
2 | | |
3 | | #ifndef ASPELLER_LANGUAGE__HPP |
4 | | #define ASPELLER_LANGUAGE__HPP |
5 | | |
6 | | #include "affix.hpp" |
7 | | #include "cache.hpp" |
8 | | #include "config.hpp" |
9 | | #include "convert.hpp" |
10 | | #include "phonetic.hpp" |
11 | | #include "posib_err.hpp" |
12 | | #include "stack_ptr.hpp" |
13 | | #include "string.hpp" |
14 | | #include "objstack.hpp" |
15 | | #include "string_enumeration.hpp" |
16 | | |
17 | | #include "iostream.hpp" |
18 | | |
19 | | using namespace acommon; |
20 | | |
21 | | namespace acommon { |
22 | | struct CheckInfo; |
23 | | |
24 | | struct ConfigConvKey : public ConvKey { |
25 | | Config::Value config_val; |
26 | | template <typename T> |
27 | 3.35k | ConfigConvKey(const T & v) : config_val(v) { |
28 | 3.35k | val = config_val.val; |
29 | 3.35k | allow_ucs = config_val.secure; |
30 | 3.35k | } acommon::ConfigConvKey::ConfigConvKey<acommon::PosibErr<acommon::Config::Value> >(acommon::PosibErr<acommon::Config::Value> const&) Line | Count | Source | 27 | 2.41k | ConfigConvKey(const T & v) : config_val(v) { | 28 | 2.41k | val = config_val.val; | 29 | 2.41k | allow_ucs = config_val.secure; | 30 | 2.41k | } |
acommon::ConfigConvKey::ConfigConvKey<char const*>(char const* const&) Line | Count | Source | 27 | 941 | ConfigConvKey(const T & v) : config_val(v) { | 28 | 941 | val = config_val.val; | 29 | 941 | allow_ucs = config_val.secure; | 30 | 941 | } |
|
31 | 941 | ConfigConvKey & operator=(const ConfigConvKey & other) { |
32 | 941 | config_val = other.config_val; |
33 | 941 | val = config_val.val; |
34 | 941 | allow_ucs = config_val.secure; |
35 | 941 | return *this; |
36 | 941 | } |
37 | 1.20k | void fix_encoding_str() { |
38 | 1.20k | String buf; |
39 | 1.20k | ::fix_encoding_str(val, buf); |
40 | 1.20k | config_val.val.swap(buf); |
41 | 1.20k | val = config_val.val; |
42 | 1.20k | } |
43 | | private: |
44 | | }; |
45 | | } |
46 | | |
47 | | namespace aspeller { |
48 | | |
// A (substring, replacement) pair of C strings.  The struct only stores
// the two pointers.  NOTE(review): presumably the strings are owned
// elsewhere (e.g. by a string buffer of the Language) -- confirm.
struct SuggestRepl {
  const char * substr;  // text to look for
  const char * repl;    // text suggested in its place
};
53 | | |
54 | | class SuggestReplEnumeration |
55 | | { |
56 | | const SuggestRepl * i_; |
57 | | const SuggestRepl * end_; |
58 | | public: |
59 | | SuggestReplEnumeration(const SuggestRepl * b, const SuggestRepl * e) |
60 | 2.46k | : i_(b), end_(e) {} |
61 | 0 | bool at_end() const {return i_ == end_;} |
62 | 1.04M | const SuggestRepl * next() { |
63 | 1.04M | if (i_ == end_) return 0; |
64 | 1.04M | return i_++; |
65 | 1.04M | } |
66 | | }; |
67 | | |
68 | | // CharInfo |
69 | | |
// Bit mask of per-character properties; only the low 6 bits are used.
typedef unsigned int CharInfo;

static const CharInfo LOWER  = 0x01;
static const CharInfo UPPER  = 0x02;
static const CharInfo TITLE  = 0x04;
static const CharInfo PLAIN  = 0x08;
static const CharInfo LETTER = 0x10;
static const CharInfo CLEAN  = 0x20;

// Every flag above set (0x3F).
static const CharInfo CHAR_INFO_ALL =
  LOWER | UPPER | TITLE | PLAIN | LETTER | CLEAN;
80 | | |
81 | | // |
82 | | |
83 | | // |
84 | | |
// Four pointers describing a word that may be followed by a remainder:
//   [word, sep)   the first part
//   [rest, end)   whatever follows it
// The struct does not own the characters; all lengths and offsets are
// plain pointer differences converted to unsigned.
struct CompoundWord {
  const char * word;
  const char * sep;
  const char * rest;
  const char * end;

  // True when [word, end) has zero length.
  bool empty() const {return end == word;}
  // True when [rest, end) has zero length, i.e. there is no remainder.
  bool single() const {return end == rest;}

  unsigned word_len()    const {return static_cast<unsigned>(sep - word);}
  unsigned rest_offset() const {return static_cast<unsigned>(rest - word);}
  unsigned rest_len()    const {return static_cast<unsigned>(end - rest);}

  // All pointers null: empty() and single() are both true.
  CompoundWord()
    : word(0), sep(0), rest(0), end(0) {}
  // [a, b) is the whole word; no remainder.
  CompoundWord(const char * a, const char * b)
    : word(a), sep(b), rest(b), end(b) {}
  // First part [a, b), remainder [b, c).
  CompoundWord(const char * a, const char * b, const char * c)
    : word(a), sep(b), rest(b), end(c) {}
  // First part [a, b), remainder [c, d); [b, c) lies between the two.
  CompoundWord(const char * a, const char * b, const char * c, const char * d)
    : word(a), sep(b), rest(c), end(d) {}
};
104 | | |
// NOTE(review): presumably selects which normalised form of a word is
// stored (see Language::to_stripped / Language::to_lower) -- confirm at
// the point of use.
enum StoreAs {
  Stripped,
  Lower
};
106 | | |
107 | | class Language : public Cacheable { |
108 | | public: |
109 | | typedef const Config CacheConfig; |
110 | | typedef String CacheKey; |
111 | | |
112 | | enum CharType {Unknown, WhiteSpace, Hyphen, Digit, |
113 | | NonLetter, Modifier, Letter}; |
114 | | |
115 | | struct SpecialChar { |
116 | | bool begin; |
117 | | bool middle; |
118 | | bool end; |
119 | | bool any; |
120 | 308k | SpecialChar() : begin(false), middle(false), end(false), any(false) {} |
121 | 1.38k | SpecialChar(bool b, bool m, bool e) : begin(b), middle(m), end(e), |
122 | 1.38k | any(b || m || e) {} |
123 | | }; |
124 | | |
125 | | private: |
126 | | String dir_; |
127 | | String name_; |
128 | | String charset_; |
129 | | String charmap_; |
130 | | String data_encoding_; |
131 | | |
132 | | ConvObj mesg_conv_; |
133 | | ConvObj to_utf8_; |
134 | | ConvObj from_utf8_; |
135 | | |
136 | 3.02G | unsigned char to_uchar(char c) const {return static_cast<unsigned char>(c);} |
137 | | |
138 | | SpecialChar special_[256]; |
139 | | CharInfo char_info_[256]; |
140 | | char to_lower_[256]; |
141 | | char to_upper_[256]; |
142 | | char to_title_[256]; |
143 | | char to_stripped_[256]; |
144 | | char to_plain_[256]; |
145 | | int to_uni_[256]; |
146 | | CharType char_type_[256]; |
147 | | char to_clean_[256]; |
148 | | char de_accent_[256]; |
149 | | |
150 | | StoreAs store_as_; |
151 | | |
152 | | String soundslike_chars_; |
153 | | String clean_chars_; |
154 | | |
155 | | bool have_soundslike_; |
156 | | bool have_repl_; |
157 | | |
158 | | StackPtr<Soundslike> soundslike_; |
159 | | StackPtr<AffixMgr> affix_; |
160 | | StackPtr<Config> lang_config_; |
161 | | |
162 | | StringBuffer buf_; |
163 | | Vector<SuggestRepl> repls_; |
164 | | |
165 | | Language(const Language &); |
166 | | void operator=(const Language &); |
167 | | |
168 | | public: // but don't use |
169 | | |
170 | | char sl_first_[256]; |
171 | | char sl_rest_[256]; |
172 | | |
173 | | public: |
174 | | |
175 | 1.20k | Language() {} |
176 | | PosibErr<void> setup(const String & lang, const Config * config); |
177 | | PosibErr<void> set_lang_defaults(Config & config) const; |
178 | | |
179 | 2.36k | const char * data_dir() const {return dir_.c_str();} |
180 | 36.8k | const char * name() const {return name_.c_str();} |
181 | 9.57k | const char * charmap() const {return charmap_.c_str();} |
182 | 4.79k | const char * data_encoding() const {return data_encoding_.c_str();} |
183 | | |
184 | 55.1k | const Convert * mesg_conv() const {return mesg_conv_.ptr;} |
185 | 0 | const Convert * to_utf8() const {return to_utf8_.ptr;} |
186 | 0 | const Convert * from_utf8() const {return from_utf8_.ptr;} |
187 | | |
188 | 27.5k | int to_uni(char c) const {return to_uni_[to_uchar(c)];} |
189 | | |
190 | | // |
191 | | // case conversion |
192 | | // |
193 | | |
194 | 950k | char to_upper(char c) const {return to_upper_[to_uchar(c)];} |
195 | 80.7k | bool is_upper(char c) const {return to_upper(c) == c;} |
196 | | |
197 | 29.0M | char to_lower(char c) const {return to_lower_[to_uchar(c)];} |
198 | 433k | bool is_lower(char c) const {return to_lower(c) == c;} |
199 | | |
200 | 298k | char to_title(char c) const {return to_title_[to_uchar(c)];} |
201 | 0 | bool is_title(char c) const {return to_title(c) == c;} |
202 | | |
203 | 0 | char * to_lower(char * res, const char * str) const { |
204 | 0 | while (*str) *res++ = to_lower(*str++); *res = '\0'; return res;} |
205 | 128k | char * to_upper(char * res, const char * str) const { |
206 | 598k | while (*str) *res++ = to_upper(*str++); *res = '\0'; return res;} |
207 | | |
208 | 19.6k | void to_lower(String & res, const char * str) const { |
209 | 24.3M | res.clear(); while (*str) res += to_lower(*str++);} |
210 | 0 | void to_upper(String & res, const char * str) const { |
211 | 0 | res.clear(); while (*str) res += to_upper(*str++);} |
212 | | |
213 | 0 | bool is_lower(const char * str) const { |
214 | 0 | while (*str) {if (!is_lower(*str++)) return false;} return true;} |
215 | 0 | bool is_upper(const char * str) const { |
216 | 0 | while (*str) {if (!is_upper(*str++)) return false;} return true;} |
217 | | |
218 | | // |
219 | | // |
220 | | // |
221 | | |
222 | 0 | char to_plain(char c) const {return to_plain_[to_uchar(c)];} |
223 | | |
224 | 132k | char de_accent(char c) const {return de_accent_[to_uchar(c)];} |
225 | | |
226 | 1.33M | SpecialChar special(char c) const {return special_[to_uchar(c)];} |
227 | | |
228 | 28.0M | CharType char_type(char c) const {return char_type_[to_uchar(c)];} |
229 | 27.7M | bool is_alpha(char c) const {return char_type(c) > NonLetter;} |
230 | | |
231 | 61.6M | CharInfo char_info(char c) const {return char_info_[to_uchar(c)];} |
232 | | |
233 | | // |
234 | | // stripped |
235 | | // |
236 | | |
237 | 302k | char to_stripped(char c) const {return to_stripped_[to_uchar(c)];} |
238 | | |
239 | | // return a pointer to the END of the string |
240 | 0 | char * to_stripped(char * res, const char * str) const { |
241 | 0 | for (; *str; ++str) { |
242 | 0 | char c = to_stripped(*str); |
243 | 0 | if (c) *res++ = c; |
244 | 0 | } |
245 | 0 | *res = '\0'; |
246 | 0 | return res; |
247 | 0 | } |
248 | 0 | void to_stripped(String & res, const char * str) const { |
249 | 0 | res.clear(); |
250 | 0 | for (; *str; ++str) { |
251 | 0 | char c = to_stripped(*str); |
252 | 0 | if (c) res += c; |
253 | 0 | } |
254 | 0 | } |
255 | | |
256 | 0 | bool is_stripped(char c) const {return to_stripped(c) == c;} |
257 | | |
258 | 0 | bool is_stripped(const char * str) const { |
259 | 0 | while (*str) {if (!is_stripped(*str++)) return false;} return true;} |
260 | | |
261 | | // |
262 | | // Clean |
263 | | // |
264 | | // The "clean" form is how words are indixed in the dictionary. |
265 | | // It will at very least convert the word to lower case. It may |
266 | | // also strip accents and non-letters. |
267 | | // |
268 | | |
269 | 2.89G | char to_clean(char c) const {return to_clean_[to_uchar(c)];} |
270 | | |
271 | 14.2M | char * to_clean(char * res, const char * str) const { |
272 | 102M | for (; *str; ++str) { |
273 | 88.0M | char c = to_clean(*str); |
274 | 88.0M | if (c) *res++ = c; |
275 | 88.0M | } |
276 | 14.2M | *res = '\0'; |
277 | 14.2M | return res; |
278 | 14.2M | } |
279 | 19.6k | void to_clean(String & res, const char * str) const { |
280 | 19.6k | res.clear(); |
281 | 24.3M | for (; *str; ++str) { |
282 | 24.3M | char c = to_clean(*str); |
283 | 24.3M | if (c) res += c; |
284 | 24.3M | } |
285 | 19.6k | } |
286 | | |
287 | 0 | bool is_clean(char c) const {return to_clean(c) == c;} |
288 | | |
289 | 0 | bool is_clean(const char * str) const { |
290 | 0 | while (*str) {if (!is_clean(*str++)) return false;} return true;} |
291 | | |
292 | 0 | bool is_clean_wi(WordInfo wi) const { |
293 | 0 | return false; |
294 | 0 | //return wi & CASE_PATTEN == AllLower && |
295 | 0 | } |
296 | | |
297 | | |
298 | 18.8k | const char * clean_chars() const {return clean_chars_.c_str();} |
299 | | |
300 | | // |
301 | | // Soundslike |
302 | | // |
303 | | |
304 | 4.77k | bool have_soundslike() const {return have_soundslike_;} |
305 | | |
306 | 2.37k | const char * soundslike_name() const {return soundslike_->name();} |
307 | 2.37k | const char * soundslike_version() const {return soundslike_->version();} |
308 | | |
309 | 19.6k | void to_soundslike(String & res, ParmStr word) const { |
310 | 19.6k | res.resize(word.size()); |
311 | 19.6k | char * e = soundslike_->to_soundslike(res.data(), word.str(), word.size()); |
312 | 19.6k | res.resize(e - res.data()); |
313 | 19.6k | } |
314 | | |
315 | | // returns a pointer to the END of the string |
316 | 663k | char * to_soundslike(char * res, const char * str, int len = -1) const { |
317 | 663k | return soundslike_->to_soundslike(res,str,len); |
318 | 663k | } |
319 | | |
320 | 3.73M | char * to_soundslike(char * res, const char * str, int len, WordInfo wi) const { |
321 | 3.73M | if (!have_soundslike_ && (wi & ALL_CLEAN)) return 0; |
322 | 3.73M | else return soundslike_->to_soundslike(res,str,len); |
323 | 3.73M | } |
324 | | |
325 | 0 | const char * soundslike_chars() const {return soundslike_chars_.c_str();} |
326 | | |
327 | | // |
328 | | // Affix compression methods |
329 | | // |
330 | | |
331 | 1.57M | const AffixMgr * affix() const {return affix_;} |
332 | | |
333 | 0 | bool have_affix() const {return affix_;} |
334 | | |
335 | 2.49k | void munch(ParmStr word, GuessInfo * cl, bool cross = true) const { |
336 | 2.49k | if (affix_) |
337 | 2.49k | affix_->munch(word, cl, cross); |
338 | 2.49k | } |
339 | | |
340 | | WordAff * expand(ParmStr word, ParmStr aff, |
341 | 0 | ObjStack & buf, int limit = INT_MAX) const { |
342 | 0 | if (affix_) |
343 | 0 | return affix_->expand(word, aff, buf, limit); |
344 | 0 | else |
345 | 0 | return fake_expand(word, aff, buf); |
346 | 0 | } |
347 | | WordAff * fake_expand(ParmStr word, ParmStr aff, ObjStack & buf) const; |
348 | | |
349 | | // |
350 | | // Repl |
351 | | // |
352 | | |
353 | 1.18k | bool have_repl() const {return have_repl_;} |
354 | | |
355 | 2.46k | SuggestReplEnumeration * repl() const { |
356 | 2.46k | return new SuggestReplEnumeration(repls_.pbegin(), repls_.pend());} |
357 | | |
358 | | // |
359 | | // |
360 | | // |
361 | | |
362 | | WordInfo get_word_info(ParmStr str) const; |
363 | | |
364 | | // |
365 | | // fix_case |
366 | | // |
367 | | |
368 | | CasePattern case_pattern(ParmStr str) const; |
369 | | |
370 | | CasePattern case_pattern(const char * str, unsigned size) const; |
371 | | |
372 | | void fix_case(CasePattern case_pattern, char * str) |
373 | 0 | { |
374 | 0 | if (!str[0]) return; |
375 | 0 | if (case_pattern == AllUpper) to_upper(str,str); |
376 | 0 | else if (case_pattern == FirstUpper) *str = to_title(*str); |
377 | 0 | } |
378 | | void fix_case(CasePattern case_pattern, |
379 | | char * res, const char * str) const; |
380 | | const char * fix_case(CasePattern case_pattern, |
381 | | const char * str, String & buf) const; |
382 | | |
383 | | // |
384 | | // |
385 | | // |
386 | | |
387 | | CompoundWord split_word(const char * str, unsigned size, bool camel_case) const; |
388 | | |
389 | | // |
390 | | // for cache |
391 | | // |
392 | | |
393 | 1.20k | static inline PosibErr<Language *> get_new(const String & lang, const Config * config) { |
394 | 1.20k | StackPtr<Language> l(new Language()); |
395 | 1.20k | RET_ON_ERR(l->setup(lang, config)); |
396 | 1.20k | return l.release(); |
397 | 1.20k | } |
398 | | |
399 | 7.13k | bool cache_key_eq(const String & l) const {return name_ == l;} |
400 | | }; |
401 | | |
402 | | typedef Language LangImpl; |
403 | | |
404 | | struct MsgConv : public ConvP |
405 | | { |
406 | 24 | MsgConv(const Language * l) : ConvP(l->mesg_conv()) {} |
407 | 55.0k | MsgConv(const Language & l) : ConvP(l.mesg_conv()) {} |
408 | | }; |
409 | | |
410 | | struct InsensitiveCompare { |
411 | | // compares to strings without regards to casing or special characters |
412 | | const Language * lang; |
413 | 66.9k | InsensitiveCompare(const Language * l = 0) : lang(l) {} |
414 | 0 | operator bool () const {return lang;} |
415 | | int operator() (const char * a, const char * b) const |
416 | 63.5M | { |
417 | 63.5M | char x, y; |
418 | 63.5M | for (;;) |
419 | 68.0M | { |
420 | 68.1M | while (x = lang->to_clean(*a++), !x); |
421 | 86.4M | while (y = lang->to_clean(*b++), !y); |
422 | 68.0M | if (x == 0x10 || y == 0x10 || x != y) break; |
423 | 68.0M | } |
424 | 63.5M | return static_cast<unsigned char>(x) - static_cast<unsigned char>(y); |
425 | 63.5M | } |
426 | | }; |
427 | | |
428 | | struct InsensitiveEqual { |
429 | | InsensitiveCompare cmp; |
430 | 5.96k | InsensitiveEqual(const Language * l = 0) : cmp(l) {} |
431 | | bool operator() (const char * a, const char * b) const |
432 | 63.2M | { |
433 | 63.2M | return cmp(a,b) == 0; |
434 | 63.2M | } |
435 | | }; |
436 | | |
437 | | template <typename HASH_INT = size_t> |
438 | | struct InsensitiveHash { |
439 | | // hashes a string without regards to casing or special begin |
440 | | // or end characters |
441 | | const Language * lang; |
442 | 2.37k | InsensitiveHash() {} |
443 | | InsensitiveHash(const Language * l) |
444 | 3.59k | : lang(l) {} |
445 | | HASH_INT operator() (const char * s) const |
446 | 54.4M | { |
447 | 54.4M | HASH_INT h = 0; |
448 | 2.68G | for (;;) { |
449 | 2.68G | if (*s == 0) break; |
450 | 2.63G | unsigned char c = lang->to_clean(*s++); |
451 | 2.63G | if (c) h=5*h + c; |
452 | 2.63G | } |
453 | 54.4M | return h; |
454 | 54.4M | } |
455 | | }; |
456 | | |
457 | | struct SensitiveCompare { |
458 | | const Language * lang; |
459 | | bool case_insensitive; |
460 | | bool ignore_accents; // unused |
461 | | bool begin; // if not begin we are checking the end of the word |
462 | | bool end; // if not end we are checking the beginning of the word |
463 | | // if both false we are checking the middle of a word |
464 | | SensitiveCompare(const Language * l = 0) |
465 | 4.87k | : lang(l), case_insensitive(false), ignore_accents(false), |
466 | 4.87k | begin(true), end(true) {} |
467 | | bool operator() (const char * word, const char * inlist) const; |
468 | | }; |
469 | | |
470 | | struct CleanAffix { |
471 | | const Language * lang; |
472 | | OStream * log; |
473 | | MsgConv msgconv1; |
474 | | MsgConv msgconv2; |
475 | | CleanAffix(const Language * lang0, OStream * log0); |
476 | | char * operator() (ParmStr word, char * aff); |
477 | | }; |
478 | | |
479 | | class WordListIterator |
480 | | { |
481 | | public: |
482 | | struct Value { |
483 | | SimpleString word; |
484 | | SimpleString aff; |
485 | | }; |
486 | | WordListIterator(StringEnumeration * in, |
487 | | const Language * lang, |
488 | | OStream * log); |
489 | | // init may set "norm-strict" to true which is why it is not const |
490 | | PosibErr<void> init (Config & config); |
491 | | // init_plain initialized the iterator to read in a plain word |
492 | | // list without any affix flags, for simplicity it will expect the |
493 | | // input to be utf-8. It will also assume clean the words unless |
494 | | // the `clean-words` option is explicitly specified. Like init it |
495 | | // may set "norm-strict" to true which is why it is not const |
496 | | PosibErr<void> init_plain (Config & config); |
497 | 0 | const Value & operator*() const {return val;} |
498 | 0 | const Value * operator-> () const {return &val;} |
499 | | PosibErr<bool> adv(); |
500 | | private: |
501 | | bool have_affix; |
502 | | bool validate_words; |
503 | | bool validate_affixes; |
504 | | bool clean_words; |
505 | | bool skip_invalid_words; |
506 | | bool clean_affixes; |
507 | | StringEnumeration * in; |
508 | | const Language * lang; |
509 | | ConvEC iconv; |
510 | | OStream * log; |
511 | | Value val; |
512 | | String data; |
513 | | const char * orig; |
514 | | char * str; |
515 | | char * str_end; |
516 | | CleanAffix clean_affix; |
517 | | }; |
518 | | |
519 | | String get_stripped_chars(const Language & l); |
520 | | |
521 | | String get_clean_chars(const Language & l); |
522 | | |
523 | | PosibErr<void> check_if_sane(const Language & l, ParmStr word); |
524 | | PosibErr<void> check_if_valid(const Language & l, ParmStr word); |
525 | | PosibErr<void> validate_affix(const Language & l, ParmStr word, ParmStr aff); |
526 | | |
527 | | bool find_language(Config & c); |
528 | | |
529 | | PosibErr<Language *> new_language(const Config &, ParmStr lang = 0); |
530 | | |
531 | | PosibErr<void> open_affix_file(const Config &, FStream & o); |
532 | | } |
533 | | |
534 | | |
535 | | #endif |