Coverage Report

Created: 2025-10-13 06:12

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aspell/modules/speller/default/language.hpp
Line
Count
Source
1
// Copyright 2004 by Kevin Atkinson under the terms of the LGPL
2
3
#ifndef ASPELLER_LANGUAGE__HPP
4
#define ASPELLER_LANGUAGE__HPP
5
6
#include "affix.hpp"
7
#include "cache.hpp"
8
#include "config.hpp"
9
#include "convert.hpp"
10
#include "phonetic.hpp"
11
#include "posib_err.hpp"
12
#include "stack_ptr.hpp"
13
#include "string.hpp"
14
#include "objstack.hpp"
15
#include "string_enumeration.hpp"
16
17
#include "iostream.hpp"
18
19
using namespace acommon;
20
21
namespace acommon {
22
  struct CheckInfo;
23
24
  struct ConfigConvKey : public ConvKey {
25
    Config::Value config_val;
26
    template <typename T>
27
3.35k
    ConfigConvKey(const T & v) : config_val(v) {
28
3.35k
      val = config_val.val;
29
3.35k
      allow_ucs = config_val.secure;
30
3.35k
    }
acommon::ConfigConvKey::ConfigConvKey<acommon::PosibErr<acommon::Config::Value> >(acommon::PosibErr<acommon::Config::Value> const&)
Line
Count
Source
27
2.41k
    ConfigConvKey(const T & v) : config_val(v) {
28
2.41k
      val = config_val.val;
29
2.41k
      allow_ucs = config_val.secure;
30
2.41k
    }
acommon::ConfigConvKey::ConfigConvKey<char const*>(char const* const&)
Line
Count
Source
27
941
    ConfigConvKey(const T & v) : config_val(v) {
28
941
      val = config_val.val;
29
941
      allow_ucs = config_val.secure;
30
941
    }
31
941
    ConfigConvKey & operator=(const ConfigConvKey & other) {
32
941
      config_val = other.config_val; 
33
941
      val = config_val.val;
34
941
      allow_ucs = config_val.secure;
35
941
      return *this;
36
941
    }
37
1.20k
    void fix_encoding_str() {
38
1.20k
      String buf;
39
1.20k
      ::fix_encoding_str(val, buf);
40
1.20k
      config_val.val.swap(buf);
41
1.20k
      val = config_val.val;
42
1.20k
    }
43
  private:
44
  };
45
}
46
47
namespace aspeller {
48
49
  struct SuggestRepl {
50
    const char * substr;
51
    const char * repl;
52
  };
53
  
54
  class SuggestReplEnumeration
55
  {
56
    const SuggestRepl * i_;
57
    const SuggestRepl * end_;
58
  public:
59
    SuggestReplEnumeration(const SuggestRepl * b, const SuggestRepl * e)
60
2.46k
      : i_(b), end_(e) {}
61
0
    bool at_end() const {return i_ == end_;}
62
1.04M
    const SuggestRepl * next() {
63
1.04M
      if (i_ == end_) return 0;
64
1.04M
      return i_++;
65
1.04M
    }
66
  };
67
68
  // CharInfo
69
70
  typedef unsigned int CharInfo; // 6 bits
71
72
  static const CharInfo LOWER  = (1 << 0);
73
  static const CharInfo UPPER  = (1 << 1);
74
  static const CharInfo TITLE  = (1 << 2);
75
  static const CharInfo PLAIN  = (1 << 3);
76
  static const CharInfo LETTER = (1 << 4);
77
  static const CharInfo CLEAN  = (1 << 5);
78
79
  static const CharInfo CHAR_INFO_ALL = 0x3F;
80
81
  //
82
83
  //
84
85
  struct CompoundWord {
86
    const char * word;
87
    const char * sep;
88
    const char * rest;
89
    const char * end;
90
24.3k
    bool empty() const {return word == end;}
91
125k
    bool single() const {return rest == end;}
92
54.9k
    unsigned word_len() const {return sep - word;}
93
24.3k
    unsigned rest_offset() const {return rest - word;}
94
24.3k
    unsigned rest_len() const {return end - rest;}
95
    CompoundWord()
96
0
      : word(), sep(), rest(), end() {}
97
    CompoundWord(const char * a, const char * b)
98
95.6k
      : word(a), sep(b), rest(b), end(b) {}
99
    CompoundWord(const char * a, const char * b, const char * c)
100
54.7k
      : word(a), sep(b), rest(b), end(c) {}
101
    CompoundWord(const char * a, const char * b, const char * c, const char * d)
102
0
      : word(a), sep(b), rest(c), end(d) {}
103
  };
104
105
  enum StoreAs {Stripped, Lower};
106
107
  class Language : public Cacheable {
108
  public:
109
    typedef const Config CacheConfig;
110
    typedef String       CacheKey;
111
112
    enum CharType {Unknown, WhiteSpace, Hyphen, Digit, 
113
                   NonLetter, Modifier, Letter};
114
    
115
    struct SpecialChar {
116
      bool begin;
117
      bool middle;
118
      bool end;
119
      bool any;
120
308k
      SpecialChar() : begin(false), middle(false), end(false), any(false) {}
121
1.38k
      SpecialChar(bool b, bool m, bool e) : begin(b), middle(m), end(e),
122
1.38k
                                            any(b || m || e) {}
123
    };
124
125
  private:
126
    String   dir_;
127
    String   name_;
128
    String   charset_;
129
    String   charmap_;
130
    String   data_encoding_;
131
132
    ConvObj  mesg_conv_;
133
    ConvObj  to_utf8_;
134
    ConvObj  from_utf8_;
135
136
3.02G
    unsigned char to_uchar(char c) const {return static_cast<unsigned char>(c);}
137
138
    SpecialChar special_[256];
139
    CharInfo      char_info_[256];
140
    char          to_lower_[256];
141
    char          to_upper_[256];
142
    char          to_title_[256];
143
    char          to_stripped_[256];
144
    char          to_plain_[256];
145
    int           to_uni_[256];
146
    CharType      char_type_[256];
147
    char          to_clean_[256];
148
    char          de_accent_[256];
149
150
    StoreAs       store_as_;
151
152
    String      soundslike_chars_;
153
    String      clean_chars_;
154
155
    bool have_soundslike_;
156
    bool have_repl_;
157
158
    StackPtr<Soundslike> soundslike_;
159
    StackPtr<AffixMgr>   affix_;
160
    StackPtr<Config>     lang_config_;
161
162
    StringBuffer buf_;
163
    Vector<SuggestRepl> repls_;
164
165
    Language(const Language &);
166
    void operator=(const Language &);
167
168
  public: // but don't use
169
170
    char          sl_first_[256];
171
    char          sl_rest_[256];
172
173
  public:
174
175
1.20k
    Language() {}
176
    PosibErr<void> setup(const String & lang, const Config * config);
177
    PosibErr<void> set_lang_defaults(Config & config) const;
178
179
2.36k
    const char * data_dir() const {return dir_.c_str();}
180
36.8k
    const char * name() const {return name_.c_str();}
181
9.57k
    const char * charmap() const {return charmap_.c_str();}
182
4.79k
    const char * data_encoding() const {return data_encoding_.c_str();}
183
184
55.1k
    const Convert * mesg_conv() const {return mesg_conv_.ptr;}
185
0
    const Convert * to_utf8() const {return to_utf8_.ptr;}
186
0
    const Convert * from_utf8() const {return from_utf8_.ptr;}
187
188
27.5k
    int to_uni(char c) const {return to_uni_[to_uchar(c)];}
189
190
    //
191
    // case conversion
192
    //
193
194
950k
    char to_upper(char c) const {return to_upper_[to_uchar(c)];}
195
80.7k
    bool is_upper(char c) const {return to_upper(c) == c;}
196
197
29.0M
    char to_lower(char c) const {return to_lower_[to_uchar(c)];}
198
433k
    bool is_lower(char c) const {return to_lower(c) == c;}
199
200
298k
    char to_title(char c) const {return to_title_[to_uchar(c)];}
201
0
    bool is_title(char c) const {return to_title(c) == c;}
202
203
0
    char * to_lower(char * res, const char * str) const {
204
0
      while (*str) *res++ = to_lower(*str++); *res = '\0'; return res;}
205
128k
    char * to_upper(char * res, const char * str) const {
206
598k
      while (*str) *res++ = to_upper(*str++); *res = '\0'; return res;}
207
208
19.6k
    void to_lower(String & res, const char * str) const {
209
24.3M
      res.clear(); while (*str) res += to_lower(*str++);}
210
0
    void to_upper(String & res, const char * str) const {
211
0
      res.clear(); while (*str) res += to_upper(*str++);}
212
213
0
    bool is_lower(const char * str) const {
214
0
      while (*str) {if (!is_lower(*str++)) return false;} return true;}
215
0
    bool is_upper(const char * str) const {
216
0
      while (*str) {if (!is_upper(*str++)) return false;} return true;}
217
218
    //
219
    //
220
    //
221
222
0
    char to_plain(char c) const {return to_plain_[to_uchar(c)];}
223
224
132k
    char de_accent(char c) const {return de_accent_[to_uchar(c)];}
225
    
226
1.33M
    SpecialChar special(char c) const {return special_[to_uchar(c)];}
227
  
228
28.0M
    CharType char_type(char c) const {return char_type_[to_uchar(c)];}
229
27.7M
    bool is_alpha(char c) const {return char_type(c) >  NonLetter;}
230
231
61.6M
    CharInfo char_info(char c) const {return char_info_[to_uchar(c)];}
232
233
    //
234
    // stripped
235
    //
236
237
302k
    char to_stripped(char c) const {return to_stripped_[to_uchar(c)];}
238
239
    // return a pointer to the END of the string
240
0
    char * to_stripped(char * res, const char * str) const {
241
0
      for (; *str; ++str) {
242
0
        char c = to_stripped(*str);
243
0
        if (c) *res++ = c;
244
0
      }
245
0
      *res = '\0';
246
0
      return res;
247
0
    }
248
0
    void to_stripped(String & res, const char * str) const {
249
0
      res.clear();
250
0
      for (; *str; ++str) {
251
0
        char c = to_stripped(*str);
252
0
        if (c) res += c;
253
0
      }
254
0
    }
255
256
0
    bool is_stripped(char c) const {return to_stripped(c) == c;}
257
258
0
    bool is_stripped(const char * str) const {
259
0
      while (*str) {if (!is_stripped(*str++)) return false;} return true;}
260
261
    //
262
    // Clean
263
    //
264
    // The "clean" form is how words are indexed in the dictionary.
265
    // It will at the very least convert the word to lower case.  It may
266
    // also strip accents and non-letters.
267
    //
268
269
2.89G
    char to_clean(char c) const {return to_clean_[to_uchar(c)];}
270
271
14.2M
    char * to_clean(char * res, const char * str) const {
272
102M
      for (; *str; ++str) {
273
88.0M
        char c = to_clean(*str);
274
88.0M
        if (c) *res++ = c;
275
88.0M
      }
276
14.2M
      *res = '\0';
277
14.2M
      return res;
278
14.2M
    }
279
19.6k
    void to_clean(String & res, const char * str) const {
280
19.6k
      res.clear();
281
24.3M
      for (; *str; ++str) {
282
24.3M
        char c = to_clean(*str);
283
24.3M
        if (c) res += c;
284
24.3M
      }
285
19.6k
    }
286
287
0
    bool is_clean(char c) const {return to_clean(c) == c;}
288
289
0
    bool is_clean(const char * str) const {
290
0
      while (*str) {if (!is_clean(*str++)) return false;} return true;}
291
292
0
    bool is_clean_wi(WordInfo wi) const {
293
0
      return false;
294
0
      //return wi & CASE_PATTEN == AllLower && 
295
0
    }
296
297
298
18.8k
    const char * clean_chars() const {return clean_chars_.c_str();}
299
300
    //
301
    // Soundslike
302
    // 
303
  
304
4.77k
    bool have_soundslike() const {return have_soundslike_;}
305
    
306
2.37k
    const char * soundslike_name() const {return soundslike_->name();}
307
2.37k
    const char * soundslike_version() const {return soundslike_->version();}
308
309
19.6k
    void to_soundslike(String & res, ParmStr word) const {
310
19.6k
      res.resize(word.size());
311
19.6k
      char * e = soundslike_->to_soundslike(res.data(), word.str(), word.size());
312
19.6k
      res.resize(e - res.data());
313
19.6k
    }
314
    
315
    // returns a pointer to the END of the string
316
663k
    char * to_soundslike(char * res, const char * str, int len = -1) const { 
317
663k
      return soundslike_->to_soundslike(res,str,len);
318
663k
    }
319
320
3.73M
    char * to_soundslike(char * res, const char * str, int len, WordInfo wi) const {
321
3.73M
      if (!have_soundslike_ && (wi & ALL_CLEAN)) return 0;
322
3.73M
      else return soundslike_->to_soundslike(res,str,len);
323
3.73M
    }
324
325
0
    const char * soundslike_chars() const {return soundslike_chars_.c_str();}
326
327
    //
328
    // Affix compression methods
329
    //
330
331
1.57M
    const AffixMgr * affix() const {return affix_;}
332
333
0
    bool have_affix() const {return affix_;}
334
335
2.49k
    void munch(ParmStr word, GuessInfo * cl, bool cross = true) const {
336
2.49k
      if (affix_)
337
2.49k
        affix_->munch(word, cl, cross);
338
2.49k
    }
339
      
340
    WordAff * expand(ParmStr word, ParmStr aff, 
341
0
                     ObjStack & buf, int limit = INT_MAX) const {
342
0
      if (affix_)
343
0
        return affix_->expand(word, aff, buf, limit);
344
0
      else
345
0
        return fake_expand(word, aff, buf);
346
0
    }
347
    WordAff * fake_expand(ParmStr word, ParmStr aff, ObjStack & buf) const;
348
349
    //
350
    // Repl
351
    //
352
353
1.18k
    bool have_repl() const {return have_repl_;}
354
355
2.46k
    SuggestReplEnumeration * repl() const {
356
2.46k
      return new SuggestReplEnumeration(repls_.pbegin(), repls_.pend());}
357
    
358
    //
359
    //
360
    //
361
362
    WordInfo get_word_info(ParmStr str) const;
363
    
364
    //
365
    // fix_case
366
    //
367
368
    CasePattern case_pattern(ParmStr str) const;
369
370
    CasePattern case_pattern(const char * str, unsigned size) const;
371
372
    void fix_case(CasePattern case_pattern, char * str)
373
0
    {
374
0
      if (!str[0]) return;
375
0
      if (case_pattern == AllUpper) to_upper(str,str);
376
0
      else if (case_pattern == FirstUpper) *str = to_title(*str);
377
0
    }
378
    void fix_case(CasePattern case_pattern, 
379
                  char * res, const char * str) const;
380
    const char * fix_case(CasePattern case_pattern, 
381
                          const char * str, String & buf) const;
382
383
    //
384
    //
385
    //
386
387
    CompoundWord split_word(const char * str, unsigned size, bool camel_case) const;
388
389
    //
390
    // for cache
391
    //
392
393
1.20k
    static inline PosibErr<Language *> get_new(const String & lang, const Config * config) {
394
1.20k
      StackPtr<Language> l(new Language());
395
1.20k
      RET_ON_ERR(l->setup(lang, config));
396
1.20k
      return l.release();
397
1.20k
    }
398
399
7.13k
    bool cache_key_eq(const String & l) const  {return name_ == l;}
400
  };
401
402
  typedef Language LangImpl;
403
404
  struct MsgConv : public ConvP
405
  {
406
24
    MsgConv(const Language * l) : ConvP(l->mesg_conv()) {}
407
55.0k
    MsgConv(const Language & l) : ConvP(l.mesg_conv()) {}
408
  };
409
410
  struct InsensitiveCompare {
411
    // compares two strings without regard to casing or special characters
412
    const Language * lang;
413
66.9k
    InsensitiveCompare(const Language * l = 0) : lang(l) {}
414
0
    operator bool () const {return lang;}
415
    int operator() (const char * a, const char * b) const
416
63.5M
    { 
417
63.5M
      char x, y;
418
63.5M
      for (;;)
419
68.0M
      {
420
68.1M
        while (x = lang->to_clean(*a++), !x);
421
86.4M
        while (y = lang->to_clean(*b++), !y);
422
68.0M
        if (x == 0x10 || y == 0x10 || x != y) break;
423
68.0M
      }
424
63.5M
      return static_cast<unsigned char>(x) - static_cast<unsigned char>(y);
425
63.5M
    }
426
  };
427
428
  struct InsensitiveEqual {
429
    InsensitiveCompare cmp;
430
5.96k
    InsensitiveEqual(const Language * l = 0) : cmp(l) {}
431
    bool operator() (const char * a, const char * b) const
432
63.2M
    {
433
63.2M
      return cmp(a,b) == 0;
434
63.2M
    }
435
  };
436
  
437
  template <typename HASH_INT = size_t>
438
  struct InsensitiveHash {
439
    // hashes a string without regard to casing or special begin
440
    // or end characters
441
    const Language * lang;
442
2.37k
    InsensitiveHash() {}
443
    InsensitiveHash(const Language * l)
444
3.59k
  : lang(l) {}
445
    HASH_INT operator() (const char * s) const
446
54.4M
    {
447
54.4M
      HASH_INT h = 0;
448
2.68G
      for (;;) {
449
2.68G
  if (*s == 0) break;
450
2.63G
        unsigned char c = lang->to_clean(*s++);
451
2.63G
  if (c) h=5*h + c;
452
2.63G
      }
453
54.4M
      return h;
454
54.4M
    }
455
  };
456
457
  struct SensitiveCompare {
458
    const Language * lang;
459
    bool case_insensitive;
460
    bool ignore_accents; // unused
461
    bool begin; // if not begin we are checking the end of the word
462
    bool end;   // if not end we are checking the beginning of the word
463
                // if both false we are checking the middle of a word
464
    SensitiveCompare(const Language * l = 0) 
465
4.87k
      : lang(l), case_insensitive(false), ignore_accents(false),
466
4.87k
        begin(true), end(true) {}
467
    bool operator() (const char * word, const char * inlist) const;
468
  };
469
470
  struct CleanAffix {
471
    const Language * lang;
472
    OStream * log;
473
    MsgConv msgconv1;
474
    MsgConv msgconv2;
475
    CleanAffix(const Language * lang0, OStream * log0);
476
    char * operator() (ParmStr word, char * aff);
477
  };
478
479
  class WordListIterator
480
  {
481
  public:
482
    struct Value {
483
      SimpleString word;
484
      SimpleString aff;
485
    };
486
    WordListIterator(StringEnumeration * in,
487
                     const Language * lang,
488
                     OStream * log);
489
    // init may set "norm-strict" to true which is why it is not const
490
    PosibErr<void> init (Config & config);
491
    // init_plain initializes the iterator to read in a plain word
492
    // list without any affix flags, for simplicity it will expect the
493
    // input to be utf-8.  It will also assume the words should be cleaned unless
494
    // the `clean-words` option is explicitly specified.  Like init it
495
    // may set "norm-strict" to true which is why it is not const
496
    PosibErr<void> init_plain (Config & config);
497
0
    const Value & operator*() const {return val;}
498
0
    const Value * operator-> () const {return &val;}
499
    PosibErr<bool> adv();
500
  private:
501
    bool have_affix;
502
    bool validate_words;
503
    bool validate_affixes;
504
    bool clean_words;
505
    bool skip_invalid_words;
506
    bool clean_affixes;
507
    StringEnumeration * in;
508
    const Language * lang;
509
    ConvEC iconv;
510
    OStream * log;
511
    Value val;
512
    String data;
513
    const char * orig;
514
    char * str;
515
    char * str_end;
516
    CleanAffix clean_affix;
517
  };
518
519
  String get_stripped_chars(const Language & l);
520
521
  String get_clean_chars(const Language & l);
522
  
523
  PosibErr<void> check_if_sane(const Language & l, ParmStr word);
524
  PosibErr<void> check_if_valid(const Language & l, ParmStr word);
525
  PosibErr<void> validate_affix(const Language & l, ParmStr word, ParmStr aff);
526
527
  bool find_language(Config & c);
528
529
  PosibErr<Language *> new_language(const Config &, ParmStr lang = 0);
530
531
  PosibErr<void> open_affix_file(const Config &, FStream & o);
532
}
533
534
535
#endif