Coverage Report

Created: 2025-10-10 06:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aspell/modules/speller/default/language.cpp
Line
Count
Source
1
// Copyright 2000 by Kevin Atkinson under the terms of the LGPL
2
3
#include "settings.h"
4
5
#include <vector>
6
#include <assert.h>
7
8
#include <iostream.hpp>
9
10
#include "asc_ctype.hpp"
11
#include "clone_ptr-t.hpp"
12
#include "config.hpp"
13
#include "enumeration.hpp"
14
#include "errors.hpp"
15
#include "file_data_util.hpp"
16
#include "fstream.hpp"
17
#include "language.hpp"
18
#include "string.hpp"
19
#include "cache-t.hpp"
20
#include "getdata.hpp"
21
#include "file_util.hpp"
22
23
#ifdef ENABLE_NLS
24
#  include <langinfo.h>
25
#endif
26
27
#include "gettext.h"
28
29
namespace aspeller {
30
31
  static const char TO_CHAR_TYPE[256] = {
32
    // 1  2  3  4  5  6  7  8  9  A  B  C  D  E  F 
33
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0
34
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1
35
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2
36
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3
37
    0, 4, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 6, 5, 0, 0, // 4
38
    0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, // 5
39
    0, 4, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 6, 5, 0, 0, // 6
40
    0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, // 7
41
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
42
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
43
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
44
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
45
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // C
46
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // D
47
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // E
48
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // F
49
  };
50
51
  static const int FOR_CONFIG = 1;
52
53
  static const KeyInfo lang_config_keys[] = {
54
    {"charset",             KeyInfoString, "iso-8859-1", ""}
55
    , {"data-encoding",       KeyInfoString, "<charset>", ""}
56
    , {"name",                KeyInfoString, "", ""}
57
    , {"run-together",        KeyInfoBool,   "", "", 0, FOR_CONFIG}
58
    , {"run-together-limit",  KeyInfoInt,    "", "", 0, FOR_CONFIG}
59
    , {"run-together-min",    KeyInfoInt,    "", "", 0, FOR_CONFIG}
60
    , {"soundslike",          KeyInfoString, "none", ""}
61
    , {"special",             KeyInfoString, "", ""}
62
    , {"ignore-accents" ,     KeyInfoBool, "", "", 0, FOR_CONFIG}
63
    , {"invisible-soundslike",KeyInfoBool, "", "", 0, FOR_CONFIG}
64
    , {"keyboard",            KeyInfoString, "standard", "", 0, FOR_CONFIG} 
65
    , {"affix",               KeyInfoString, "none", ""}
66
    , {"affix-compress",      KeyInfoBool, "false", "", 0, FOR_CONFIG}
67
    , {"partially-expand",    KeyInfoBool, "false", "", 0, FOR_CONFIG}
68
    , {"affix-char",          KeyInfoString, "/", "", 0, FOR_CONFIG}
69
    , {"flag-char",           KeyInfoString, ":", "", 0, FOR_CONFIG}
70
    , {"repl-table",          KeyInfoString, "none", ""}
71
    , {"sug-split-char",      KeyInfoList, "", "", 0, FOR_CONFIG}
72
    , {"store-as",            KeyInfoString, "", ""}
73
    , {"try",                 KeyInfoString, "", ""}
74
    , {"normalize",           KeyInfoBool, "false", "", 0, FOR_CONFIG}
75
    , {"norm-required",       KeyInfoBool, "false", "", 0, FOR_CONFIG}
76
    , {"norm-form",           KeyInfoString, "nfc", "", 0, FOR_CONFIG}
77
  };
78
79
  static GlobalCache<Language> language_cache("language");
80
81
  PosibErr<void> Language::setup(const String & lang, const Config * config)
82
1.09k
  {
83
    //
84
    // get_lang_info
85
    //
86
87
1.09k
    String dir1,dir2,path;
88
89
1.09k
    fill_data_dir(config, dir1, dir2);
90
1.09k
    dir_ = find_file(path,dir1,dir2,lang,".dat");
91
92
1.09k
    lang_config_ = 
93
1.09k
      new Config("speller-lang",
94
1.09k
                 lang_config_keys, 
95
1.09k
                 lang_config_keys + sizeof(lang_config_keys)/sizeof(KeyInfo));
96
1.09k
    Config & data = *lang_config_;
97
98
1.09k
    {
99
1.09k
      PosibErrBase pe = data.read_in_file(path);
100
1.09k
      if (pe.has_err(cant_read_file)) {
101
0
  String mesg = pe.get_err()->mesg;
102
0
  mesg[0] = asc_tolower(mesg[0]);
103
0
  mesg = _("This is probably because: ") + mesg;
104
0
  return make_err(unknown_language, lang, mesg);
105
1.09k
      } else if (pe.has_err())
106
0
  return pe;
107
1.09k
    }
108
109
1.09k
    if (!data.have("name"))
110
0
      return make_err(bad_file_format, path, _("The required field \"name\" is missing."));
111
112
1.09k
    String buf;
113
1.09k
    name_          = data.retrieve("name");
114
1.09k
    charset_       = fix_encoding_str(data.retrieve("charset"), buf);
115
1.09k
    charmap_       = charset_;
116
117
1.09k
    ConfigConvKey d_enc = data.retrieve_value("data-encoding");
118
1.09k
    d_enc.fix_encoding_str();
119
1.09k
    data_encoding_ = d_enc.val;
120
121
1.09k
    DataPair d;
122
123
    //
124
    // read header of cset data file
125
    //
126
  
127
1.09k
    FStream char_data;
128
1.09k
    String char_data_name;
129
1.09k
    find_file(char_data_name,dir1,dir2,charset_,".cset");
130
1.09k
    RET_ON_ERR(char_data.open(char_data_name, "r"));
131
    
132
1.09k
    String temp;
133
1.09k
    char * p;
134
3.27k
    do {
135
3.27k
      p = get_nb_line(char_data, temp);
136
3.27k
      if (*p == '=') {
137
1.09k
        ++p;
138
2.18k
        while (asc_isspace(*p)) ++p;
139
1.09k
        charmap_ = p;
140
1.09k
      }
141
3.27k
    } while (*p != '/');
142
143
    //
144
    // fill in tables
145
    //
146
147
280k
    for (unsigned int i = 0; i != 256; ++i) {
148
279k
      p = get_nb_line(char_data, temp);
149
279k
      if (!p || strtoul(p, &p, 16) != i) 
150
0
        return make_err(bad_file_format, char_data_name);
151
279k
      to_uni_[i] = strtol(p, &p, 16);
152
558k
      while (asc_isspace(*p)) ++p;
153
279k
      char_type_[i] = static_cast<CharType>(TO_CHAR_TYPE[to_uchar(*p++)]);
154
558k
      while (asc_isspace(*p)) ++p;
155
279k
      ++p; // display, ignored for now
156
279k
      CharInfo inf = char_type_[i] >= Letter ? LETTER : 0;
157
279k
      to_upper_[i] = static_cast<char>(strtol(p, &p, 16));
158
279k
      inf |= to_uchar(to_upper_[i]) == i ? UPPER : 0;
159
279k
      to_lower_[i] = static_cast<char>(strtol(p, &p, 16));
160
279k
      inf |= to_uchar(to_lower_[i]) == i ? LOWER : 0;
161
279k
      to_title_[i] = static_cast<char>(strtol(p, &p, 16));
162
279k
      inf |= to_uchar(to_title_[i]) == i ? TITLE : 0;
163
279k
      to_plain_[i] = static_cast<char>(strtol(p, &p, 16));
164
279k
      inf |= to_uchar(to_plain_[i]) == i ? PLAIN : 0;
165
279k
      inf |= to_uchar(to_plain_[i]) == 0 ? PLAIN : 0;
166
279k
      sl_first_[i] = static_cast<char>(strtol(p, &p, 16));
167
279k
      sl_rest_[i]  = static_cast<char>(strtol(p, &p, 16));
168
279k
      char_info_[i] = inf;
169
279k
    }
170
171
280k
    for (unsigned int i = 0; i != 256; ++i) {
172
279k
      de_accent_[i] = to_plain_[i] == 0 ? to_uchar(i) : to_plain_[i];
173
279k
    }
174
175
1.09k
    to_plain_[0] = 0x10; // to make things slightly easier
176
1.09k
    to_plain_[1] = 0x10;
177
178
280k
    for (unsigned int i = 0; i != 256; ++i) {
179
279k
      to_stripped_[i] = to_plain_[(unsigned char)to_lower_[i]];
180
279k
    }
181
    
182
1.09k
    char_data.close();
183
184
1.09k
    if (data.have("store-as"))
185
0
      buf = data.retrieve("store-as");
186
1.09k
    else if (data.retrieve_bool("affix-compress"))
187
43
      buf = "lower";
188
1.04k
    else
189
1.04k
      buf = "stripped";
190
1.09k
    char * clean_is;
191
1.09k
    if (buf == "stripped") {
192
1.04k
      store_as_ = Stripped;
193
1.04k
      clean_is = to_stripped_;
194
1.04k
    } else {
195
43
      store_as_ = Lower;
196
43
      clean_is = to_lower_;
197
43
    }
198
199
280k
    for (unsigned i = 0; i != 256; ++i) {
200
279k
      to_clean_[i] = char_type_[i] > NonLetter ? clean_is[i] : 0;
201
279k
      if ((unsigned char)to_clean_[i] == i) char_info_[i] |= CLEAN;
202
279k
    }
203
204
1.09k
    to_clean_[0x00] = 0x10; // to make things slightly easier
205
1.09k
    to_clean_[0x10] = 0x10;
206
207
1.09k
    clean_chars_   = get_clean_chars(*this);
208
209
    //
210
    // determine which mapping to use
211
    //
212
213
1.09k
    if (charmap_ != charset_) {
214
0
      if (file_exists(dir1 + charset_ + ".cmap") || 
215
0
          file_exists(dir2 + charset_ + ".cmap"))
216
0
      {
217
0
        charmap_ = charset_;
218
0
      } else if (data_encoding_ == charset_) {
219
0
        data_encoding_ = charmap_;
220
0
      }
221
0
    }
222
      
223
    //
224
    // set up conversions
225
    //
226
1.09k
    {
227
1.09k
#ifdef ENABLE_NLS
228
1.09k
      const char * tmp = 0;
229
1.09k
      tmp = bind_textdomain_codeset("aspell", 0);
230
1.09k
#ifdef HAVE_LANGINFO_CODESET
231
1.09k
      if (!tmp) tmp = nl_langinfo(CODESET);
232
1.09k
#endif
233
1.09k
      if (ascii_encoding(*config, tmp)) tmp = 0;
234
1.09k
      if (tmp)
235
0
        RET_ON_ERR(mesg_conv_.setup(*config, charmap_, fix_encoding_str(tmp, buf), NormTo));
236
1.09k
      else 
237
1.09k
#endif
238
1.09k
        RET_ON_ERR(mesg_conv_.setup(*config, charmap_, data_encoding_, NormTo));
239
      // no need to check for errors here since we know charmap_ is a
240
      // supported encoding
241
1.09k
      RET_ON_ERR(to_utf8_.setup(*config, charmap_, "utf-8", NormTo));
242
1.09k
      RET_ON_ERR(from_utf8_.setup(*config, "utf-8", charmap_, NormFrom));
243
1.09k
    }
244
    
245
1.09k
    Conv iconv;
246
1.09k
    RET_ON_ERR(iconv.setup(*config, data_encoding_, charmap_, NormFrom));
247
248
    //
249
    // set up special
250
    //
251
252
1.09k
    init(data.retrieve("special"), d, buf);
253
2.35k
    while (split(d)) {
254
1.26k
      char c = iconv(d.key)[0];
255
1.26k
      split(d);
256
1.26k
      special_[to_uchar(c)] = 
257
1.26k
        SpecialChar (d.key[0] == '*',d.key[1] == '*', d.key[2] == '*');
258
1.26k
    }
259
260
    //
261
    // prep phonetic code
262
    //
263
264
1.09k
    {
265
1.09k
      PosibErr<Soundslike *> pe = new_soundslike(data.retrieve("soundslike"),
266
1.09k
                                                 iconv,
267
1.09k
                                               this);
268
1.09k
      if (pe.has_err()) return pe;
269
1.09k
      soundslike_.reset(pe.data);
270
1.09k
    }
271
0
    soundslike_chars_ = soundslike_->soundslike_chars();
272
273
1.09k
    have_soundslike_ = strcmp(soundslike_->name(), "none") != 0;
274
275
    //
276
    // prep affix code
277
    //
278
1.09k
    {
279
1.09k
      PosibErr<AffixMgr *> pe = new_affix_mgr(data.retrieve("affix"), iconv, this);
280
1.09k
      if (pe.has_err()) return pe;
281
1.09k
      affix_.reset(pe.data);
282
1.09k
    }
283
284
    //
285
    // fill repl tables (if any)
286
    //
287
288
0
    String repl = data.retrieve("repl-table");
289
1.09k
    have_repl_ = false;
290
1.09k
    if (repl != "none") {
291
292
43
      String repl_file;
293
43
      FStream REPL;
294
43
      find_file(repl_file, dir1, dir2, repl, "_repl", ".dat");
295
43
      RET_ON_ERR(REPL.open(repl_file, "r"));
296
      
297
43
      size_t num_repl = 0;
298
3.52k
      while (getdata_pair(REPL, d, buf)) {
299
3.52k
        ::to_lower(d.key);
300
3.52k
        if (d.key == "rep") {
301
43
          num_repl = atoi(d.value); // FIXME make this more robust
302
43
          break;
303
43
        }
304
3.52k
      }
305
306
43
      if (num_repl > 0)
307
43
        have_repl_ = true;
308
309
45.4k
      for (size_t i = 0; i != num_repl; ++i) {
310
45.3k
        bool res = getdata_pair(REPL, d, buf);
311
45.3k
        assert(res); // FIXME
312
45.3k
        ::to_lower(d.key);
313
45.3k
        assert(d.key == "rep"); // FIXME
314
45.3k
        split(d);
315
45.3k
        SuggestRepl rep;
316
45.3k
        rep.substr = buf_.dup(iconv(d.key));
317
45.3k
        if (check_if_valid(*this, rep.substr).get_err()) 
318
23.6k
          continue; // FIXME: This should probably be an error, but
319
                    // this may cause problems with compatibility with
320
                    // Myspell as these entries may make sense for
321
                    // Myspell (but obviously not for Aspell)
322
21.7k
        to_clean((char *)rep.substr, rep.substr);
323
21.7k
        rep.repl   = buf_.dup(iconv(d.value));
324
21.7k
        if (check_if_valid(*this, rep.repl).get_err()) 
325
3.26k
          continue; // FIXME: Ditto
326
18.4k
        to_clean((char *)rep.repl, rep.repl);
327
18.4k
        if (strcmp(rep.substr, rep.repl) == 0 || rep.substr[0] == '\0')
328
215
          continue; // FIXME: Ditto
329
18.2k
        repls_.push_back(rep);
330
18.2k
      }
331
332
43
    }
333
1.09k
    return no_err;
334
1.09k
  }
335
336
  PosibErr<void> Language::set_lang_defaults(Config & config) const
337
7.53k
  {
338
7.53k
    config.replace_internal("actual-lang", name());
339
7.53k
    RET_ON_ERR(config.lang_config_merge(*lang_config_, FOR_CONFIG, data_encoding_));
340
7.53k
    return no_err;
341
7.53k
  }
342
343
  WordInfo Language::get_word_info(ParmStr str) const
344
0
  {
345
0
    CharInfo first = CHAR_INFO_ALL, all = CHAR_INFO_ALL;
346
0
    const char * p = str;
347
0
    while (*p && (first = char_info(*p++), all &= first, !(first & LETTER)));
348
0
    while (*p) all &= char_info(*p++);
349
0
    WordInfo res;
350
0
    if      (all & LOWER)   res = AllLower;
351
0
    else if (all & UPPER)   res = AllUpper;
352
0
    else if (first & TITLE) res = FirstUpper;
353
0
    else                    res = Other;
354
0
    if (all & PLAIN)  res |= ALL_PLAIN;
355
0
    if (all & CLEAN)  res |= ALL_CLEAN;
356
0
    return res;
357
0
  }
358
  
359
  CasePattern Language::case_pattern(ParmStr str) const  
360
982k
  {
361
982k
    CharInfo first = CHAR_INFO_ALL, all = CHAR_INFO_ALL;
362
982k
    const char * p = str;
363
1.89M
    while (*p && (first = char_info(*p++), all &= first, !(first & LETTER)));
364
63.3M
    while (*p) all &= char_info(*p++);
365
982k
    if      (all & LOWER)   return AllLower;
366
80.4k
    else if (all & UPPER)   return AllUpper;
367
33.5k
    else if (first & TITLE) return FirstUpper;
368
18.1k
    else                    return Other;
369
982k
  }
370
371
  CasePattern Language::case_pattern(const char * str, unsigned size) const  
372
27.1k
  {
373
27.1k
    CharInfo first = CHAR_INFO_ALL, all = CHAR_INFO_ALL;
374
27.1k
    const char * p = str;
375
27.1k
    const char * end = p + size;
376
27.1k
    while (p < end && (first = char_info(*p++), all &= first, !(first & LETTER)));
377
43.8k
    while (p < end) all &= char_info(*p++);
378
27.1k
    if      (all & LOWER)   return AllLower;
379
21.8k
    else if (all & UPPER)   return AllUpper;
380
2.96k
    else if (first & TITLE) return FirstUpper;
381
22
    else                    return Other;
382
27.1k
  }
383
  
384
  void Language::fix_case(CasePattern case_pattern,
385
                          char * res, const char * str) const 
386
1.49M
  {
387
1.49M
    if (!str[0]) return;
388
1.49M
    if (case_pattern == AllUpper) {
389
173k
      to_upper(res,str);
390
1.49M
    } if (case_pattern == FirstUpper && is_lower(str[0])) {
391
131k
      *res = to_title(str[0]);
392
131k
      if (res == str) return;
393
0
      res++;
394
0
      str++;
395
0
      while (*str) *res++ = *str++;
396
0
      *res = '\0';
397
1.35M
    } else {
398
1.35M
      if (res == str) return;
399
0
      while (*str) *res++ = *str++;
400
0
      *res = '\0';
401
0
    }
402
1.49M
  }
403
404
  const char * Language::fix_case(CasePattern case_pattern, const char * str,
405
                                  String & buf) const 
406
0
  {
407
0
    if (!str[0]) return str;
408
0
    if (case_pattern == AllUpper) {
409
0
      to_upper(buf,str);
410
0
      return buf.str();
411
0
    } if (case_pattern == FirstUpper && is_lower(str[0])) {
412
0
      buf.clear();
413
0
      buf += to_title(str[0]);
414
0
      str++;
415
0
      while (*str) buf += *str++;
416
0
      return buf.str();
417
0
    } else {
418
0
      return str;
419
0
    }
420
0
  }
421
422
  WordAff * Language::fake_expand(ParmStr word, ParmStr aff, 
423
                                  ObjStack & buf) const 
424
0
  {
425
0
    WordAff * cur = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
426
0
    cur->word = buf.dup(word);
427
0
    cur->aff = (unsigned char *)buf.dup("");
428
0
    cur->next = 0;
429
0
    return cur;
430
0
  }
431
432
  CompoundWord Language::split_word(const char * word, unsigned len,
433
                                    bool camel_case) const
434
110k
  {
435
110k
    if (!camel_case || len <= 1)
436
74.2k
      return CompoundWord(word, word + len);
437
    // len >= 2
438
36.2k
    if (is_upper(word[0])) {
439
23.7k
      if (is_lower(word[1])) {
440
16.1k
        unsigned i = 2;
441
41.9k
        while (i < len && is_lower(word[i]))
442
25.7k
          ++i;
443
16.1k
        return CompoundWord(word, word + i, word + len);
444
16.1k
      }
445
7.56k
      if (is_upper(word[1])) {
446
7.56k
        unsigned i = 2;
447
43.5k
        while (i < len && is_upper(word[i]))
448
35.9k
          ++i;
449
7.56k
        if (i == len)
450
2.15k
          return CompoundWord(word, word + len);
451
        // The first upper case letter is assumed to be part of the next word
452
5.40k
        return CompoundWord(word, word + i - 1, word + len);
453
7.56k
      }
454
12.5k
    } else if (is_lower(word[0])) {
455
12.5k
      unsigned i = 1;
456
94.2k
      while (i < len && is_lower(word[i]))
457
81.7k
        ++i;
458
12.5k
      return CompoundWord(word, word + i, word + len);
459
12.5k
    }
460
    // this should't happen but just in case...
461
0
    return CompoundWord(word, word + len);
462
36.2k
  }
463
  
464
  bool SensitiveCompare::operator() (const char * word0, 
465
             const char * inlist0) const
466
148k
  {
467
148k
    assert(*word0 && *inlist0);
468
148k
  try_again:
469
148k
    const char * word = word0;
470
148k
    const char * inlist = inlist0;
471
472
148k
    if (!case_insensitive) {
473
      
474
147k
      if (begin) {
475
142k
        if (*word == *inlist || *word == lang->to_title(*inlist)) ++word, ++inlist;
476
127k
        else                                                      goto try_upper;
477
142k
      }
478
40.2k
      while (*word && *inlist && *word == *inlist) ++word, ++inlist;
479
19.8k
      if (*inlist) goto try_upper;
480
13.7k
      if (end && lang->special(*word).end) ++word;
481
13.7k
      if (*word) goto try_upper;
482
12.2k
      return true;
483
135k
    try_upper:
484
135k
      word = word0;
485
135k
      inlist = inlist0;
486
141k
      while (*word && *inlist && *word == lang->to_upper(*inlist)) ++word, ++inlist;
487
135k
      if (*inlist) goto fail;
488
3.42k
      if (end && lang->special(*word).end) ++word;
489
3.42k
      if (*word) goto fail;
490
      
491
3.42k
    } else { // case_insensitive
492
      
493
1.38k
      while (*word && *inlist && 
494
1.03k
             lang->to_upper(*word) == lang->to_upper(*inlist)) ++word, ++inlist;
495
846
      if (*inlist) goto fail;
496
351
      if (end && lang->special(*word).end) ++word;
497
351
      if (*word) goto fail;
498
      
499
351
    }
500
2.07k
    return true;
501
502
133k
  fail:
503
133k
    if (begin && lang->special(*word0).begin) {++word0; goto try_again;}
504
133k
    return false;
505
133k
  }
506
507
  static PosibErrBase invalid_word_e(const Language & l,
508
                                     ParmStr word,
509
                                     const char * msg,
510
                                     char chr = 0)
511
26.9k
  {
512
26.9k
    char m[200];
513
26.9k
    if (chr) {
514
      // the "char *" cast is needed due to an incorrect "snprintf"
515
      //   declaration on some platforms.
516
26.9k
      snprintf(m, 200, (char *)msg, MsgConv(l)(chr), l.to_uni(chr));
517
26.9k
      msg = m;
518
26.9k
    }
519
26.9k
    return make_err(invalid_word, MsgConv(l)(word), msg);
520
26.9k
  }
521
522
67.0k
  PosibErr<void> check_if_sane(const Language & l, ParmStr word) {
523
67.0k
    if (*word == '\0') 
524
0
      return invalid_word_e(l, word, _("Empty string."));
525
67.0k
    return no_err;
526
67.0k
  }
527
528
67.0k
  PosibErr<void> check_if_valid(const Language & l, ParmStr word) {
529
67.0k
    RET_ON_ERR(check_if_sane(l, word));
530
67.0k
    const char * i = word;
531
67.0k
    if (!l.is_alpha(*i)) {
532
22.8k
      if (!l.special(*i).begin)
533
22.8k
        return invalid_word_e(l, word, _("The character '%s' (U+%02X) may not appear at the beginning of a word."), *i);
534
0
      else if (!l.is_alpha(*(i+1)))
535
0
        return invalid_word_e(l, word, _("The character '%s' (U+%02X) must be followed by an alphabetic character."), *i);
536
0
      else if (!*(i+1))
537
0
        return invalid_word_e(l, word, _("Does not contain any alphabetic characters."));
538
22.8k
    }
539
155k
    for (;*(i+1) != '\0'; ++i) { 
540
111k
      if (!l.is_alpha(*i)) {
541
5.37k
        if (!l.special(*i).middle)
542
387
          return invalid_word_e(l, word, _("The character '%s' (U+%02X) may not appear in the middle of a word."), *i);
543
4.98k
        else if (!l.is_alpha(*(i+1)))
544
0
          return invalid_word_e(l, word, _("The character '%s' (U+%02X) must be followed by an alphabetic character."), *i);
545
5.37k
      }
546
111k
    }
547
43.8k
    if (!l.is_alpha(*i)) {
548
3.65k
      if (*i == '\r')
549
0
        return invalid_word_e(l, word, _("The character '\\r' (U+0D) may not appear at the end of a word. " 
550
0
                                         "This probably means means that the file is using MS-DOS EOL instead of Unix EOL."), *i);
551
3.65k
      if (!l.special(*i).end)
552
3.65k
        return invalid_word_e(l, word, _("The character '%s' (U+%02X) may not appear at the end of a word."), *i);
553
3.65k
    }
554
40.1k
    return no_err;
555
43.8k
  }
556
557
  PosibErr<void> validate_affix(const Language & l, ParmStr word, ParmStr aff)
558
0
  {
559
0
    for (const char * a = aff; *a; ++a) {
560
0
      CheckAffixRes res = l.affix()->check_affix(word, *a);
561
0
      if (res == InvalidAffix)
562
0
        return make_err(invalid_affix, MsgConv(l)(*a), MsgConv(l)(word));
563
0
      else if (res == InapplicableAffix)
564
0
        return make_err(inapplicable_affix, MsgConv(l)(*a), MsgConv(l)(word));
565
0
    }
566
0
    return no_err;
567
0
  }
568
569
  CleanAffix::CleanAffix(const Language * lang0, OStream * log0)
570
15
    : lang(lang0), log(log0), msgconv1(lang0), msgconv2(lang0)
571
15
  {
572
15
  }
573
  
574
  char * CleanAffix::operator()(ParmStr word, char * aff)
575
0
  {
576
0
    char * r = aff;
577
0
    for (const char * a = aff; *a; ++a) {
578
0
      CheckAffixRes res = lang->affix()->check_affix(word, *a);
579
0
      if (res == ValidAffix) {
580
0
        *r = *a;
581
0
        ++r;
582
0
      } else if (log) {
583
0
        const char * msg = res == InvalidAffix 
584
0
          ? _("Warning: Removing invalid affix '%s' from word %s.\n")
585
0
          : _("Warning: Removing inapplicable affix '%s' from word %s.\n");
586
0
        log->printf(msg, msgconv1(*a), msgconv2(word));
587
0
      }
588
0
    }
589
0
    *r = '\0';
590
0
    return r;
591
0
  }
592
593
0
  String get_stripped_chars(const Language & lang) {
594
0
    bool chars_set[256] = {0};
595
0
    String     chars_list;
596
0
    for (int i = 0; i != 256; ++i) 
597
0
    {
598
0
      char c = static_cast<char>(i);
599
0
  if (lang.is_alpha(c) || lang.special(c).any)
600
0
    chars_set[static_cast<unsigned char>(lang.to_stripped(c))] = true;
601
0
    }
602
0
    for (int i = 1; i != 256; ++i) 
603
0
    {
604
0
      if (chars_set[i]) 
605
0
  chars_list += static_cast<char>(i);
606
0
    }
607
0
    return chars_list;
608
0
  }
609
610
1.09k
  String get_clean_chars(const Language & lang) {
611
1.09k
    bool chars_set[256] = {0};
612
1.09k
    String     chars_list;
613
280k
    for (int i = 0; i != 256; ++i) 
614
279k
    {
615
279k
      char c = static_cast<char>(i);
616
279k
      if (lang.is_alpha(c) || lang.special(c).any) 
617
124k
        chars_set[static_cast<unsigned char>(lang.to_clean(c))] = true;
618
279k
    }
619
279k
    for (int i = 1; i != 256; ++i) 
620
277k
    {
621
277k
      if (chars_set[i]) {
622
34.9k
  chars_list += static_cast<char>(i);
623
34.9k
      }
624
277k
    }
625
1.09k
    return chars_list;
626
1.09k
  }
627
628
  PosibErr<Language *> new_language(const Config & config, ParmStr lang)
629
7.53k
  {
630
7.53k
    if (!lang)
631
0
      return get_cache_data(&language_cache, &config, config.retrieve("lang"));
632
7.53k
    else
633
7.53k
      return get_cache_data(&language_cache, &config, lang);
634
7.53k
  }
635
636
  PosibErr<void> open_affix_file(const Config & c, FStream & f)
637
0
  {
638
0
    String lang = c.retrieve("lang");
639
640
0
    String dir1,dir2,path;
641
0
    fill_data_dir(&c, dir1, dir2);
642
0
    String dir = find_file(path,dir1,dir2,lang,".dat");
643
644
0
    String file;
645
0
    file += dir;
646
0
    file += '/';
647
0
    file += lang;
648
0
    file += "_affix.dat";
649
    
650
0
    RET_ON_ERR(f.open(file,"r"));
651
652
0
    return no_err;
653
0
  }
654
655
  bool find_language(Config & c)
656
0
  {
657
0
    String l_data = c.retrieve("lang");
658
0
    char * l = l_data.mstr();
659
660
0
    String dir1,dir2,path;
661
0
    fill_data_dir(&c, dir1, dir2);
662
663
0
    char * s = l + strlen(l);
664
665
0
    while (s > l) {
666
0
      find_file(path,dir1,dir2,l,".dat");
667
0
      if (file_exists(path)) {
668
0
        c.replace_internal("actual-lang", l);
669
0
        return true;
670
0
      }
671
0
      while (s > l && !(*s == '-' || *s == '_')) --s;
672
0
      *s = '\0';
673
0
    }
674
0
    return false;
675
0
  }
676
677
  WordListIterator::WordListIterator(StringEnumeration * in0,
678
                                   const Language * lang0,
679
                                   OStream * log0)
680
15
    : in(in0), lang(lang0), log(log0), val(), str(0), str_end(0),
681
15
      clean_affix(lang0, log0) {}
682
683
  PosibErr<void>  WordListIterator::init(Config & config)
684
0
  {
685
0
    if (!config.have("norm-strict"))
686
0
      config.replace("norm-strict", "true");
687
0
    have_affix = lang->have_affix();
688
0
    validate_words = config.retrieve_bool("validate-words");
689
0
    validate_affixes = config.retrieve_bool("validate-affixes");
690
0
    clean_words = config.retrieve_bool("clean-words");
691
0
    skip_invalid_words = config.retrieve_bool("skip-invalid-words");
692
0
    clean_affixes = config.retrieve_bool("clean-affixes");
693
0
    if (config.have("encoding")) {
694
0
      ConfigConvKey enc = config.retrieve_value("encoding");
695
0
      RET_ON_ERR(iconv.setup(config, enc, lang->charmap(),NormFrom));
696
0
    } else {
697
0
      RET_ON_ERR(iconv.setup(config, lang->data_encoding(), lang->charmap(), NormFrom));
698
0
    }
699
0
    return no_err;
700
0
  }
701
702
  PosibErr<void> WordListIterator::init_plain(Config & config)
703
15
  {
704
15
    if (!config.have("norm-strict"))
705
15
      config.replace("norm-strict", "true");
706
15
    have_affix = false;
707
15
    validate_words = config.retrieve_bool("validate-words");
708
15
    clean_words = true;
709
15
    if (config.have("clean-words"))
710
0
      clean_words = config.retrieve_bool("clean-words");
711
15
    skip_invalid_words = true;
712
15
    RET_ON_ERR(iconv.setup(config, "utf-8", lang->charmap(),NormFrom));
713
15
    return no_err;
714
15
  }
715
 
716
  PosibErr<bool> WordListIterator::adv() 
717
15
  {
718
15
  loop:
719
15
    if (!str) {
720
15
      orig = in->next();
721
15
      if (!orig) return false;
722
0
      if (!*orig) goto loop;
723
0
      PosibErr<const char *> pe = iconv(orig);
724
0
      if (pe.has_err()) {
725
0
        if (!skip_invalid_words) return pe;
726
0
        if (log) log->printf(_("Warning: %s Skipping string.\n"), pe.get_err()->mesg);
727
0
        else pe.ignore_err();
728
0
        goto loop;
729
0
      }
730
0
      if (pe.data == orig) {
731
0
        data = orig;
732
0
        data.ensure_null_end();
733
0
        str = data.pbegin();
734
0
        str_end = data.pend();
735
0
      } else {
736
0
        str = iconv.buf.pbegin();
737
0
        str_end = iconv.buf.pend();
738
0
      }
739
0
      char * aff = str_end;
740
0
      char * aff_end = str_end;
741
0
      if (have_affix) {
742
0
        aff = strchr(str, '/');
743
0
        if (aff == 0) {
744
0
          aff = str_end;
745
0
        } else {
746
0
          *aff = '\0';
747
0
          str_end = aff;
748
0
          ++aff;
749
0
        }
750
0
        if (validate_affixes) {
751
0
          if (clean_affixes)
752
0
            aff_end = clean_affix(str, aff);
753
0
          else
754
0
            RET_ON_ERR(validate_affix(*lang, str, aff));
755
0
        }
756
0
      }
757
0
      val.aff.str = aff;
758
0
      val.aff.size = aff_end - aff;
759
0
      if (!*aff && validate_words && clean_words) {
760
0
        char * s = str;
761
0
        while (s < str_end && !lang->is_alpha(*s) && !lang->special(*s).begin)
762
0
          *s++ = '\0';
763
0
        char * s2 = str_end - 1;
764
0
        while (s2 >= str && *s2 && !lang->is_alpha(*s2) && !lang->special(*s2).end)
765
0
          *s2-- = '\0';
766
0
      }
767
0
    }
768
0
    while (str < str_end) 
769
0
    {
770
0
      if (!*str) {++str; continue;}
771
772
0
      PosibErrBase pe2 = validate_words ? check_if_valid(*lang, str) : no_err;
773
774
0
      val.word.str = str;
775
0
      val.word.size = strlen(str);
776
0
      str += val.word.size + 1;
777
778
0
      if (!pe2.has_err() && val.word.size + (*val.aff ? val.aff.size + 1 : 0) > 240)
779
0
        pe2 = make_err(invalid_word, MsgConv(lang)(val.word),
780
0
                       _("The total length is larger than 240 characters."));
781
782
0
      if (!pe2.has_err()) return true;
783
0
      if (!skip_invalid_words) return pe2;
784
0
      if (log) log->printf(_("Warning: %s Skipping word.\n"), pe2.get_err()->mesg);
785
0
      else pe2.ignore_err();
786
0
    } 
787
0
    str = 0;
788
0
    goto loop;
789
0
  }
790
}