Coverage Report

Created: 2025-08-26 06:57

/src/aspell/modules/speller/default/language.cpp
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2000 by Kevin Atkinson under the terms of the LGPL
2
3
#include "settings.h"
4
5
#include <vector>
6
#include <assert.h>
7
8
#include <iostream.hpp>
9
10
#include "asc_ctype.hpp"
11
#include "clone_ptr-t.hpp"
12
#include "config.hpp"
13
#include "enumeration.hpp"
14
#include "errors.hpp"
15
#include "file_data_util.hpp"
16
#include "fstream.hpp"
17
#include "language.hpp"
18
#include "string.hpp"
19
#include "cache-t.hpp"
20
#include "getdata.hpp"
21
#include "file_util.hpp"
22
23
#ifdef ENABLE_NLS
24
#  include <langinfo.h>
25
#endif
26
27
#include "gettext.h"
28
29
namespace aspeller {
30
31
  // Lookup table indexed by an (unsigned) byte.  Language::setup() feeds it
  // the single-letter class code read from a ".cset" line and casts the
  // result to CharType.  Only six letters, in either case, are non-zero:
  //
  //   'W'/'w' -> 1   'H'/'h' -> 2   'D'/'d' -> 3
  //   'A'/'a' -> 4   'M'/'m' -> 5   'L'/'l' -> 6
  //
  // Every other byte maps to 0.
  // NOTE(review): the numeric values presumably line up with the CharType
  // enumerators declared in language.hpp -- confirm there.
  static const char TO_CHAR_TYPE[256] = {
    //        _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F
    /* 0_ */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 1_ */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 2_ */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 3_ */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 4_ */  0, 4, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 6, 5, 0, 0,
    /* 5_ */  0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 6_ */  0, 4, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 6, 5, 0, 0,
    /* 7_ */  0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 8_ */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 9_ */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A_ */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B_ */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C_ */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* D_ */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* E_ */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* F_ */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };
50
51
  static const int FOR_CONFIG = 1;
52
53
  static const KeyInfo lang_config_keys[] = {
54
    {"charset",             KeyInfoString, "iso-8859-1", ""}
55
    , {"data-encoding",       KeyInfoString, "<charset>", ""}
56
    , {"name",                KeyInfoString, "", ""}
57
    , {"run-together",        KeyInfoBool,   "", "", 0, FOR_CONFIG}
58
    , {"run-together-limit",  KeyInfoInt,    "", "", 0, FOR_CONFIG}
59
    , {"run-together-min",    KeyInfoInt,    "", "", 0, FOR_CONFIG}
60
    , {"soundslike",          KeyInfoString, "none", ""}
61
    , {"special",             KeyInfoString, "", ""}
62
    , {"ignore-accents" ,     KeyInfoBool, "", "", 0, FOR_CONFIG}
63
    , {"invisible-soundslike",KeyInfoBool, "", "", 0, FOR_CONFIG}
64
    , {"keyboard",            KeyInfoString, "standard", "", 0, FOR_CONFIG} 
65
    , {"affix",               KeyInfoString, "none", ""}
66
    , {"affix-compress",      KeyInfoBool, "false", "", 0, FOR_CONFIG}
67
    , {"partially-expand",    KeyInfoBool, "false", "", 0, FOR_CONFIG}
68
    , {"affix-char",          KeyInfoString, "/", "", 0, FOR_CONFIG}
69
    , {"flag-char",           KeyInfoString, ":", "", 0, FOR_CONFIG}
70
    , {"repl-table",          KeyInfoString, "none", ""}
71
    , {"sug-split-char",      KeyInfoList, "", "", 0, FOR_CONFIG}
72
    , {"store-as",            KeyInfoString, "", ""}
73
    , {"try",                 KeyInfoString, "", ""}
74
    , {"normalize",           KeyInfoBool, "false", "", 0, FOR_CONFIG}
75
    , {"norm-required",       KeyInfoBool, "false", "", 0, FOR_CONFIG}
76
    , {"norm-form",           KeyInfoString, "nfc", "", 0, FOR_CONFIG}
77
  };
78
79
  static GlobalCache<Language> language_cache("language");
80
81
  // Loads everything that describes language `lang`:
  //   1. the "<lang>.dat" language description (parsed into lang_config_),
  //   2. the "<charset>.cset" character table (fills the to_*_ / char_*_
  //      lookup tables),
  //   3. the encoding converters,
  //   4. the "special" character table,
  //   5. the soundslike and affix modules,
  //   6. the optional "<repl>_repl.dat" replacement table.
  //
  // Returns no_err on success.  Returns unknown_language when the ".dat"
  // file cannot be read, bad_file_format when a data file is missing
  // required content or is truncated, or whichever error a helper reports.
  PosibErr<void> Language::setup(const String & lang, const Config * config)
  {
    //
    // get_lang_info
    //

    String dir1,dir2,path;

    fill_data_dir(config, dir1, dir2);
    dir_ = find_file(path,dir1,dir2,lang,".dat");

    lang_config_ = 
      new Config("speller-lang",
                 lang_config_keys, 
                 lang_config_keys + sizeof(lang_config_keys)/sizeof(KeyInfo));
    Config & data = *lang_config_;

    {
      PosibErrBase pe = data.read_in_file(path);
      if (pe.has_err(cant_read_file)) {
        // Re-wrap "can't read file" as "unknown language", keeping the
        // original message (lower-cased first letter) as the explanation.
        String mesg = pe.get_err()->mesg;
        mesg[0] = asc_tolower(mesg[0]);
        mesg = _("This is probably because: ") + mesg;
        return make_err(unknown_language, lang, mesg);
      } else if (pe.has_err())
        return pe;
    }

    if (!data.have("name"))
      return make_err(bad_file_format, path, _("The required field \"name\" is missing."));

    String buf;
    name_          = data.retrieve("name");
    charset_       = fix_encoding_str(data.retrieve("charset"), buf);
    charmap_       = charset_;

    ConfigConvKey d_enc = data.retrieve_value("data-encoding");
    d_enc.fix_encoding_str();
    data_encoding_ = d_enc.val;

    DataPair d;

    //
    // read header of cset data file
    //
  
    FStream char_data;
    String char_data_name;
    find_file(char_data_name,dir1,dir2,charset_,".cset");
    RET_ON_ERR(char_data.open(char_data_name, "r"));
    
    String temp;
    char * p;
    do {
      p = get_nb_line(char_data, temp);
      // get_nb_line() yields a null pointer when no line is left (the table
      // loop below checks for the same thing).  A header with no '/' line
      // must be reported as a bad file rather than dereferenced.
      if (!p)
        return make_err(bad_file_format, char_data_name);
      if (*p == '=') {
        // "= <name>" overrides the character map name.
        ++p;
        while (asc_isspace(*p)) ++p;
        charmap_ = p;
      }
    } while (*p != '/');

    //
    // fill in tables
    //

    for (unsigned int i = 0; i != 256; ++i) {
      p = get_nb_line(char_data, temp);
      // Each line must start with its own index in hex.
      if (!p || strtoul(p, &p, 16) != i) 
        return make_err(bad_file_format, char_data_name);
      to_uni_[i] = strtol(p, &p, 16);
      while (asc_isspace(*p)) ++p;
      char_type_[i] = static_cast<CharType>(TO_CHAR_TYPE[to_uchar(*p++)]);
      while (asc_isspace(*p)) ++p;
      ++p; // display, ignored for now
      CharInfo inf = char_type_[i] >= Letter ? LETTER : 0;
      to_upper_[i] = static_cast<char>(strtol(p, &p, 16));
      inf |= to_uchar(to_upper_[i]) == i ? UPPER : 0;
      to_lower_[i] = static_cast<char>(strtol(p, &p, 16));
      inf |= to_uchar(to_lower_[i]) == i ? LOWER : 0;
      to_title_[i] = static_cast<char>(strtol(p, &p, 16));
      inf |= to_uchar(to_title_[i]) == i ? TITLE : 0;
      to_plain_[i] = static_cast<char>(strtol(p, &p, 16));
      inf |= to_uchar(to_plain_[i]) == i ? PLAIN : 0;
      inf |= to_uchar(to_plain_[i]) == 0 ? PLAIN : 0;
      sl_first_[i] = static_cast<char>(strtol(p, &p, 16));
      sl_rest_[i]  = static_cast<char>(strtol(p, &p, 16));
      char_info_[i] = inf;
    }

    // A plain value of 0 means the character is de-accented to itself.
    for (unsigned int i = 0; i != 256; ++i) {
      de_accent_[i] = to_plain_[i] == 0 ? to_uchar(i) : to_plain_[i];
    }

    to_plain_[0] = 0x10; // to make things slightly easier
    to_plain_[1] = 0x10;

    for (unsigned int i = 0; i != 256; ++i) {
      to_stripped_[i] = to_plain_[(unsigned char)to_lower_[i]];
    }
    
    char_data.close();

    // Pick the "clean" form: explicit "store-as", else "lower" when
    // affix compression is on, else "stripped".
    if (data.have("store-as"))
      buf = data.retrieve("store-as");
    else if (data.retrieve_bool("affix-compress"))
      buf = "lower";
    else
      buf = "stripped";
    char * clean_is;
    if (buf == "stripped") {
      store_as_ = Stripped;
      clean_is = to_stripped_;
    } else {
      store_as_ = Lower;
      clean_is = to_lower_;
    }

    for (unsigned i = 0; i != 256; ++i) {
      to_clean_[i] = char_type_[i] > NonLetter ? clean_is[i] : 0;
      if ((unsigned char)to_clean_[i] == i) char_info_[i] |= CLEAN;
    }

    to_clean_[0x00] = 0x10; // to make things slightly easier
    to_clean_[0x10] = 0x10;

    clean_chars_   = get_clean_chars(*this);

    //
    // determine which mapping to use
    //

    if (charmap_ != charset_) {
      if (file_exists(dir1 + charset_ + ".cmap") || 
          file_exists(dir2 + charset_ + ".cmap"))
      {
        charmap_ = charset_;
      } else if (data_encoding_ == charset_) {
        data_encoding_ = charmap_;
      }
    }
      
    //
    // set up conversions
    //
    {
#ifdef ENABLE_NLS
      const char * tmp = 0;
      tmp = bind_textdomain_codeset("aspell", 0);
#ifdef HAVE_LANGINFO_CODESET
      if (!tmp) tmp = nl_langinfo(CODESET);
#endif
      if (ascii_encoding(*config, tmp)) tmp = 0;
      if (tmp)
        RET_ON_ERR(mesg_conv_.setup(*config, charmap_, fix_encoding_str(tmp, buf), NormTo));
      else 
#endif
        RET_ON_ERR(mesg_conv_.setup(*config, charmap_, data_encoding_, NormTo));
      // no need to check for errors here since we know charmap_ is a
      // supported encoding
      RET_ON_ERR(to_utf8_.setup(*config, charmap_, "utf-8", NormTo));
      RET_ON_ERR(from_utf8_.setup(*config, "utf-8", charmap_, NormFrom));
    }
    
    Conv iconv;
    RET_ON_ERR(iconv.setup(*config, data_encoding_, charmap_, NormFrom));

    //
    // set up special
    //

    // The "special" value is a list of <char> <flags> pairs; each of the
    // three flag positions is on when it holds '*'.
    init(data.retrieve("special"), d, buf);
    while (split(d)) {
      char c = iconv(d.key)[0];
      split(d);
      special_[to_uchar(c)] = 
        SpecialChar (d.key[0] == '*',d.key[1] == '*', d.key[2] == '*');
    }

    //
    // prep phonetic code
    //

    {
      PosibErr<Soundslike *> pe = new_soundslike(data.retrieve("soundslike"),
                                                 iconv,
                                                 this);
      if (pe.has_err()) return pe;
      soundslike_.reset(pe.data);
    }
    soundslike_chars_ = soundslike_->soundslike_chars();

    have_soundslike_ = strcmp(soundslike_->name(), "none") != 0;

    //
    // prep affix code
    //
    {
      PosibErr<AffixMgr *> pe = new_affix_mgr(data.retrieve("affix"), iconv, this);
      if (pe.has_err()) return pe;
      affix_.reset(pe.data);
    }

    //
    // fill repl tables (if any)
    //

    String repl = data.retrieve("repl-table");
    have_repl_ = false;
    if (repl != "none") {

      String repl_file;
      FStream REPL;
      find_file(repl_file, dir1, dir2, repl, "_repl", ".dat");
      RET_ON_ERR(REPL.open(repl_file, "r"));
      
      // Skip ahead to the "rep <count>" header line.
      size_t num_repl = 0;
      while (getdata_pair(REPL, d, buf)) {
        ::to_lower(d.key);
        if (d.key == "rep") {
          num_repl = atoi(d.value); // FIXME make this more robust
          break;
        }
      }

      if (num_repl > 0)
        have_repl_ = true;

      for (size_t i = 0; i != num_repl; ++i) {
        // A file that ends early, or has a non-"rep" line among the
        // announced entries, is malformed.  Report it instead of relying
        // on assert(), which is compiled out under NDEBUG.
        if (!getdata_pair(REPL, d, buf))
          return make_err(bad_file_format, repl_file);
        ::to_lower(d.key);
        if (!(d.key == "rep"))
          return make_err(bad_file_format, repl_file);
        split(d);
        SuggestRepl rep;
        rep.substr = buf_.dup(iconv(d.key));
        if (check_if_valid(*this, rep.substr).get_err()) 
          continue; // FIXME: This should probably be an error, but
                    // this may cause problems with compatibility with
                    // Myspell as these entries may make sense for
                    // Myspell (but obviously not for Aspell)
        to_clean((char *)rep.substr, rep.substr);
        rep.repl   = buf_.dup(iconv(d.value));
        if (check_if_valid(*this, rep.repl).get_err()) 
          continue; // FIXME: Ditto
        to_clean((char *)rep.repl, rep.repl);
        if (strcmp(rep.substr, rep.repl) == 0 || rep.substr[0] == '\0')
          continue; // FIXME: Ditto
        repls_.push_back(rep);
      }

    }
    return no_err;
  }
335
336
  // Records this language's name in `config` as "actual-lang", then merges
  // the FOR_CONFIG-flagged entries of the language description into it.
  // Returns the merge error, if any, otherwise no_err.
  PosibErr<void> Language::set_lang_defaults(Config & config) const
  {
    config.replace_internal("actual-lang", name());

    RET_ON_ERR(config.lang_config_merge(*lang_config_,
                                        FOR_CONFIG,
                                        data_encoding_));

    return no_err;
  }
342
343
  // Classifies the case pattern of `str` (AllLower, AllUpper, FirstUpper or
  // Other) and ORs in ALL_PLAIN / ALL_CLEAN when every character carries
  // the PLAIN / CLEAN bit.
  WordInfo Language::get_word_info(ParmStr str) const
  {
    CharInfo first = CHAR_INFO_ALL;
    CharInfo all   = CHAR_INFO_ALL;
    const char * cur = str;

    // Walk up to and including the first letter; `first` ends up holding
    // the info of the last character examined here.
    while (*cur) {
      first = char_info(*cur++);
      all &= first;
      if (first & LETTER) break;
    }

    // Fold in the remaining characters.
    for (; *cur; ++cur)
      all &= char_info(*cur);

    WordInfo res;
    if (all & LOWER)
      res = AllLower;
    else if (all & UPPER)
      res = AllUpper;
    else if (first & TITLE)
      res = FirstUpper;
    else
      res = Other;

    if (all & PLAIN) res |= ALL_PLAIN;
    if (all & CLEAN) res |= ALL_CLEAN;
    return res;
  }
358
  
359
  // Determines the case pattern of the null-terminated word `str`.
  // Characters before the first letter take part in the all-lower /
  // all-upper test; the FirstUpper test looks at the first letter only.
  CasePattern Language::case_pattern(ParmStr str) const  
  {
    CharInfo first = CHAR_INFO_ALL;
    CharInfo all   = CHAR_INFO_ALL;
    const char * cur = str;

    // Advance through the first letter (or to the end if there is none).
    while (*cur) {
      first = char_info(*cur++);
      all &= first;
      if (first & LETTER) break;
    }

    for (; *cur; ++cur)
      all &= char_info(*cur);

    if (all & LOWER)   return AllLower;
    if (all & UPPER)   return AllUpper;
    if (first & TITLE) return FirstUpper;
    return Other;
  }
370
371
  // Same classification as case_pattern(ParmStr), but for a word given as
  // a pointer plus length; exactly `size` bytes are examined and no null
  // terminator is required.
  CasePattern Language::case_pattern(const char * str, unsigned size) const  
  {
    CharInfo first = CHAR_INFO_ALL;
    CharInfo all   = CHAR_INFO_ALL;
    const char * cur = str;
    const char * const stop = str + size;

    // Advance through the first letter (or to the end if there is none).
    while (cur < stop) {
      first = char_info(*cur++);
      all &= first;
      if (first & LETTER) break;
    }

    for (; cur < stop; ++cur)
      all &= char_info(*cur);

    if (all & LOWER)   return AllLower;
    if (all & UPPER)   return AllUpper;
    if (first & TITLE) return FirstUpper;
    return Other;
  }
383
  
384
  void Language::fix_case(CasePattern case_pattern,
385
                          char * res, const char * str) const 
386
685k
  {
387
685k
    if (!str[0]) return;
388
685k
    if (case_pattern == AllUpper) {
389
91.6k
      to_upper(res,str);
390
685k
    } if (case_pattern == FirstUpper && is_lower(str[0])) {
391
67.7k
      *res = to_title(str[0]);
392
67.7k
      if (res == str) return;
393
0
      res++;
394
0
      str++;
395
0
      while (*str) *res++ = *str++;
396
0
      *res = '\0';
397
617k
    } else {
398
617k
      if (res == str) return;
399
0
      while (*str) *res++ = *str++;
400
0
      *res = '\0';
401
0
    }
402
685k
  }
403
404
  const char * Language::fix_case(CasePattern case_pattern, const char * str,
405
                                  String & buf) const 
406
0
  {
407
0
    if (!str[0]) return str;
408
0
    if (case_pattern == AllUpper) {
409
0
      to_upper(buf,str);
410
0
      return buf.str();
411
0
    } if (case_pattern == FirstUpper && is_lower(str[0])) {
412
0
      buf.clear();
413
0
      buf += to_title(str[0]);
414
0
      str++;
415
0
      while (*str) buf += *str++;
416
0
      return buf.str();
417
0
    } else {
418
0
      return str;
419
0
    }
420
0
  }
421
422
  // Builds a one-element WordAff list in `buf` holding a copy of `word`
  // with an empty affix string.  The `aff` argument is not used.
  WordAff * Language::fake_expand(ParmStr word, ParmStr aff, 
                                  ObjStack & buf) const 
  {
    WordAff * node = (WordAff *)buf.alloc_bottom(sizeof(WordAff));

    node->word = buf.dup(word);
    node->aff  = (unsigned char *)buf.dup("");
    node->next = 0;

    return node;
  }
431
432
  // Splits off the first component of a (possibly camelCase) word.
  //
  // With `camel_case` off, or for words of at most one character, the whole
  // word is returned as a single component.  Otherwise the first component
  // ends where the initial run of same-case letters ends:
  //   "Upper" + lower run  -> split after the lower run,
  //   UPPER run            -> split before the last upper letter of the run
  //                           (unless the run reaches the end of the word),
  //   lower run            -> split after the run.
  CompoundWord Language::split_word(const char * word, unsigned len,
                                    bool camel_case) const
  {
    const char * const stop = word + len;

    if (!camel_case || len <= 1)
      return CompoundWord(word, stop);

    // From here on len >= 2, so word[1] is a valid character.
    if (is_upper(word[0])) {
      if (is_lower(word[1])) {
        const char * cut = word + 2;
        while (cut != stop && is_lower(*cut))
          ++cut;
        return CompoundWord(word, cut, stop);
      }
      if (is_upper(word[1])) {
        const char * cut = word + 2;
        while (cut != stop && is_upper(*cut))
          ++cut;
        if (cut == stop)
          return CompoundWord(word, stop);
        // The last upper case letter of the run is assumed to start the
        // next word.
        return CompoundWord(word, cut - 1, stop);
      }
    } else if (is_lower(word[0])) {
      const char * cut = word + 1;
      while (cut != stop && is_lower(*cut))
        ++cut;
      return CompoundWord(word, cut, stop);
    }

    // this shouldn't happen but just in case...
    return CompoundWord(word, stop);
  }
463
  
464
  bool SensitiveCompare::operator() (const char * word0, 
465
             const char * inlist0) const
466
183k
  {
467
183k
    assert(*word0 && *inlist0);
468
183k
  try_again:
469
183k
    const char * word = word0;
470
183k
    const char * inlist = inlist0;
471
472
183k
    if (!case_insensitive) {
473
      
474
183k
      if (begin) {
475
180k
        if (*word == *inlist || *word == lang->to_title(*inlist)) ++word, ++inlist;
476
171k
        else                                                      goto try_upper;
477
180k
      }
478
24.8k
      while (*word && *inlist && *word == *inlist) ++word, ++inlist;
479
11.4k
      if (*inlist) goto try_upper;
480
7.78k
      if (end && lang->special(*word).end) ++word;
481
7.78k
      if (*word) goto try_upper;
482
7.71k
      return true;
483
175k
    try_upper:
484
175k
      word = word0;
485
175k
      inlist = inlist0;
486
179k
      while (*word && *inlist && *word == lang->to_upper(*inlist)) ++word, ++inlist;
487
175k
      if (*inlist) goto fail;
488
1.27k
      if (end && lang->special(*word).end) ++word;
489
1.27k
      if (*word) goto fail;
490
      
491
1.27k
    } else { // case_insensitive
492
      
493
0
      while (*word && *inlist && 
494
0
             lang->to_upper(*word) == lang->to_upper(*inlist)) ++word, ++inlist;
495
0
      if (*inlist) goto fail;
496
0
      if (end && lang->special(*word).end) ++word;
497
0
      if (*word) goto fail;
498
      
499
0
    }
500
1.18k
    return true;
501
502
174k
  fail:
503
174k
    if (begin && lang->special(*word0).begin) {++word0; goto try_again;}
504
174k
    return false;
505
174k
  }
506
507
  // Builds an invalid_word error for `word`.  When `chr` is non-zero, `msg`
  // is used as a printf-style format that receives the character (converted
  // for display) and its Unicode code point; otherwise `msg` is used as is.
  static PosibErrBase invalid_word_e(const Language & l,
                                     ParmStr word,
                                     const char * msg,
                                     char chr = 0)
  {
    char formatted[200];
    if (chr) {
      // the "char *" cast is needed due to an incorrect "snprintf"
      //   declaration on some platforms.
      snprintf(formatted, sizeof(formatted), (char *)msg,
               MsgConv(l)(chr), l.to_uni(chr));
      msg = formatted;
    }
    return make_err(invalid_word, MsgConv(l)(word), msg);
  }
521
522
32.7k
  // Minimal sanity check: rejects the empty string with an invalid_word
  // error; anything else is accepted.
  PosibErr<void> check_if_sane(const Language & l, ParmStr word) {
    const bool is_empty = (*word == '\0');
    if (is_empty)
      return invalid_word_e(l, word, _("Empty string."));
    return no_err;
  }
527
528
32.7k
  // Checks that `word` may be stored in a word list for language `l`.
  //
  // A word must be non-empty.  A non-alphabetic character is allowed only
  // if the language marks it as "special" for the position it occupies
  // (begin / middle / end), and at the beginning or in the middle it must
  // be followed by an alphabetic character.  Returns no_err for a valid
  // word, otherwise an invalid_word error describing the offending
  // character.
  PosibErr<void> check_if_valid(const Language & l, ParmStr word) {
    RET_ON_ERR(check_if_sane(l, word));
    const char * i = word;
    if (!l.is_alpha(*i)) {
      if (!l.special(*i).begin)
        return invalid_word_e(l, word, _("The character '%s' (U+%02X) may not appear at the beginning of a word."), *i);
      // The end-of-string test must come before the is_alpha() test on the
      // next character: presumably NUL is never alphabetic, so in the
      // opposite order this more specific message could never be produced.
      else if (!*(i+1))
        return invalid_word_e(l, word, _("Does not contain any alphabetic characters."));
      else if (!l.is_alpha(*(i+1)))
        return invalid_word_e(l, word, _("The character '%s' (U+%02X) must be followed by an alphabetic character."), *i);
    }
    // Every character except the last one.
    // NOTE(review): this loop starts at the first character again, so a
    // leading special character must also be allowed in the "middle"
    // position to pass -- confirm whether that is intended.
    for (;*(i+1) != '\0'; ++i) { 
      if (!l.is_alpha(*i)) {
        if (!l.special(*i).middle)
          return invalid_word_e(l, word, _("The character '%s' (U+%02X) may not appear in the middle of a word."), *i);
        else if (!l.is_alpha(*(i+1)))
          return invalid_word_e(l, word, _("The character '%s' (U+%02X) must be followed by an alphabetic character."), *i);
      }
    }
    // The last character.
    if (!l.is_alpha(*i)) {
      if (*i == '\r')
        return invalid_word_e(l, word, _("The character '\\r' (U+0D) may not appear at the end of a word. " 
                                         "This probably means means that the file is using MS-DOS EOL instead of Unix EOL."), *i);
      if (!l.special(*i).end)
        return invalid_word_e(l, word, _("The character '%s' (U+%02X) may not appear at the end of a word."), *i);
    }
    return no_err;
  }
556
557
  // Checks every affix flag in `aff` against `word`.  Returns an
  // invalid_affix or inapplicable_affix error for the first flag that fails,
  // or no_err when all flags pass.
  PosibErr<void> validate_affix(const Language & l, ParmStr word, ParmStr aff)
  {
    const char * flag = aff;
    while (*flag) {
      const CheckAffixRes status = l.affix()->check_affix(word, *flag);
      if (status == InvalidAffix)
        return make_err(invalid_affix, MsgConv(l)(*flag), MsgConv(l)(word));
      if (status == InapplicableAffix)
        return make_err(inapplicable_affix, MsgConv(l)(*flag), MsgConv(l)(word));
      ++flag;
    }
    return no_err;
  }
568
569
  // Stores the language and the log stream (which operator() treats as
  // optional), and sets up the two message converters from the language.
  CleanAffix::CleanAffix(const Language * lang0, OStream * log0)
    : lang(lang0),
      log(log0),
      msgconv1(lang0),
      msgconv2(lang0)
  {}
573
  
574
  // Filters the affix flag string `aff` in place, keeping only the flags
  // that are valid for `word`.  Each removed flag is reported on `log` when
  // a log stream was supplied.  Returns a pointer to the new terminating
  // null of `aff`.
  char * CleanAffix::operator()(ParmStr word, char * aff)
  {
    char * dst = aff;
    for (const char * src = aff; *src; ++src) {
      CheckAffixRes res = lang->affix()->check_affix(word, *src);
      if (res == ValidAffix) {
        *dst++ = *src;
        continue;
      }
      if (!log) continue;

      const char * msg;
      if (res == InvalidAffix)
        msg = _("Warning: Removing invalid affix '%s' from word %s.\n");
      else
        msg = _("Warning: Removing inapplicable affix '%s' from word %s.\n");
      log->printf(msg, msgconv1(*src), msgconv2(word));
    }
    *dst = '\0';
    return dst;
  }
592
593
0
  // Returns, in ascending byte order, the set of "stripped" forms of all
  // characters that are alphabetic or special in `lang`.  The NUL byte is
  // never included.
  String get_stripped_chars(const Language & lang) {
    bool seen[256] = {0};

    // Mark the stripped form of every word character.
    for (int code = 0; code != 256; ++code) {
      const char ch = static_cast<char>(code);
      if (!(lang.is_alpha(ch) || lang.special(ch).any))
        continue;
      seen[static_cast<unsigned char>(lang.to_stripped(ch))] = true;
    }

    // Collect the marked bytes, skipping 0.
    String result;
    for (int code = 1; code != 256; ++code) {
      if (seen[code])
        result += static_cast<char>(code);
    }
    return result;
  }
609
610
467
  // Returns, in ascending byte order, the set of "clean" forms of all
  // characters that are alphabetic or special in `lang`.  The NUL byte is
  // never included.
  String get_clean_chars(const Language & lang) {
    bool seen[256] = {0};

    // Mark the clean form of every word character.
    for (int code = 0; code != 256; ++code) {
      const char ch = static_cast<char>(code);
      if (!(lang.is_alpha(ch) || lang.special(ch).any))
        continue;
      seen[static_cast<unsigned char>(lang.to_clean(ch))] = true;
    }

    // Collect the marked bytes, skipping 0.
    String result;
    for (int code = 1; code != 256; ++code) {
      if (seen[code])
        result += static_cast<char>(code);
    }
    return result;
  }
627
628
  // Fetches the Language for `lang` through the language cache.  When
  // `lang` is not given, the "lang" setting of `config` is used instead.
  PosibErr<Language *> new_language(const Config & config, ParmStr lang)
  {
    if (!lang) {
      // No explicit language: fall back to the configured one.
      return get_cache_data(&language_cache, &config, config.retrieve("lang"));
    }
    return get_cache_data(&language_cache, &config, lang);
  }
635
636
  // Opens "<dir>/<lang>_affix.dat" for reading on `f`, where <lang> is the
  // "lang" setting of `c` and <dir> is whatever find_file() returns when
  // looking up "<lang>.dat".  Returns the error from opening, if any.
  PosibErr<void> open_affix_file(const Config & c, FStream & f)
  {
    String lang = c.retrieve("lang");

    String dir1,dir2,path;
    fill_data_dir(&c, dir1, dir2);
    String dir = find_file(path,dir1,dir2,lang,".dat");

    // Assemble the affix file name piece by piece.
    String affix_path;
    affix_path += dir;
    affix_path += '/';
    affix_path += lang;
    affix_path += "_affix.dat";
    
    RET_ON_ERR(f.open(affix_path,"r"));
    return no_err;
  }
654
655
  // Looks for a ".dat" file matching the "lang" setting of `c`, trying
  // progressively shorter names by cutting the name at its last '-' or '_'
  // (for example "en_US-w" -> "en_US" -> "en").  On the first hit the
  // matching name is stored as "actual-lang" and true is returned; false
  // is returned when nothing matches.
  bool find_language(Config & c)
  {
    String l_data = c.retrieve("lang");
    char * l = l_data.mstr();

    String dir1,dir2,path;
    fill_data_dir(&c, dir1, dir2);

    // `cut` always points at the current end of the candidate name.
    for (char * cut = l + strlen(l); cut > l; ) {
      find_file(path,dir1,dir2,l,".dat");
      if (file_exists(path)) {
        c.replace_internal("actual-lang", l);
        return true;
      }
      // Back up to the previous separator (or the start) and truncate.
      while (cut > l && *cut != '-' && *cut != '_')
        --cut;
      *cut = '\0';
    }
    return false;
  }
676
677
  // Stores the input enumeration, the language and the log stream (which
  // adv() treats as optional); the iteration state starts out empty.
  WordListIterator::WordListIterator(StringEnumeration * in0,
                                   const Language * lang0,
                                   OStream * log0)
    : in(in0),
      lang(lang0),
      log(log0),
      val(),
      str(0),
      str_end(0),
      clean_affix(lang0, log0)
  {
  }
682
683
  // Configures the iterator from `config`: forces "norm-strict" to "true"
  // unless it is already set, reads the validation / cleaning switches, and
  // sets up the converter from the input encoding ("encoding" if given,
  // otherwise the language's data encoding) to the language's charmap.
  PosibErr<void>  WordListIterator::init(Config & config)
  {
    if (!config.have("norm-strict"))
      config.replace("norm-strict", "true");

    have_affix         = lang->have_affix();
    validate_words     = config.retrieve_bool("validate-words");
    validate_affixes   = config.retrieve_bool("validate-affixes");
    clean_words        = config.retrieve_bool("clean-words");
    skip_invalid_words = config.retrieve_bool("skip-invalid-words");
    clean_affixes      = config.retrieve_bool("clean-affixes");

    if (!config.have("encoding")) {
      RET_ON_ERR(iconv.setup(config, lang->data_encoding(), lang->charmap(), NormFrom));
    } else {
      ConfigConvKey enc = config.retrieve_value("encoding");
      RET_ON_ERR(iconv.setup(config, enc, lang->charmap(),NormFrom));
    }
    return no_err;
  }
701
702
  // Configures the iterator for plain UTF-8 word input without affix flags:
  // invalid words are always skipped, and words are cleaned unless
  // "clean-words" is explicitly set in `config`.
  PosibErr<void> WordListIterator::init_plain(Config & config)
  {
    if (!config.have("norm-strict"))
      config.replace("norm-strict", "true");

    have_affix         = false;
    skip_invalid_words = true;
    validate_words     = config.retrieve_bool("validate-words");

    // Cleaning is on by default; an explicit setting wins.
    clean_words = true;
    if (config.have("clean-words"))
      clean_words = config.retrieve_bool("clean-words");

    RET_ON_ERR(iconv.setup(config, "utf-8", lang->charmap(),NormFrom));
    return no_err;
  }
715
 
716
  // Advances to the next word.  Returns true when `val` holds a new word
  // (and its affix flags), false when the input is exhausted, or an error
  // when an invalid entry is met and skip_invalid_words is off.
  //
  // One input string can yield several words: when cleaning is enabled,
  // disallowed leading/trailing characters are overwritten with NULs and
  // the pieces between NULs are handed out one per call.
  PosibErr<bool> WordListIterator::adv() 
  {
    for (;;) {

      if (!str) {
        // Nothing pending: fetch and prepare the next input string.
        orig = in->next();
        if (!orig) return false;
        if (!*orig) continue;
        PosibErr<const char *> pe = iconv(orig);
        if (pe.has_err()) {
          if (!skip_invalid_words) return pe;
          if (log) log->printf(_("Warning: %s Skipping string.\n"), pe.get_err()->mesg);
          else pe.ignore_err();
          continue;
        }
        if (pe.data == orig) {
          // The converter handed back the input itself; copy it into
          // `data` so there is a writable buffer to work on.
          data = orig;
          data.ensure_null_end();
          str = data.pbegin();
          str_end = data.pend();
        } else {
          str = iconv.buf.pbegin();
          str_end = iconv.buf.pend();
        }
        char * aff = str_end;
        char * aff_end = str_end;
        if (have_affix) {
          // Split "word/flags" at the first '/'.
          aff = strchr(str, '/');
          if (aff == 0) {
            aff = str_end;
          } else {
            *aff = '\0';
            str_end = aff;
            ++aff;
          }
          if (validate_affixes) {
            if (clean_affixes)
              aff_end = clean_affix(str, aff);
            else
              RET_ON_ERR(validate_affix(*lang, str, aff));
          }
        }
        val.aff.str = aff;
        val.aff.size = aff_end - aff;
        if (!*aff && validate_words && clean_words) {
          // Blank out characters that may not start / end a word.
          char * head = str;
          while (head < str_end && !lang->is_alpha(*head) && !lang->special(*head).begin)
            *head++ = '\0';
          char * tail = str_end - 1;
          while (tail >= str && *tail && !lang->is_alpha(*tail) && !lang->special(*tail).end)
            *tail-- = '\0';
        }
      }

      // Hand out the next NUL-separated piece of the prepared string.
      while (str < str_end) 
      {
        if (!*str) {++str; continue;}

        PosibErrBase pe2 = validate_words ? check_if_valid(*lang, str) : no_err;

        val.word.str = str;
        val.word.size = strlen(str);
        str += val.word.size + 1;

        if (!pe2.has_err() && val.word.size + (*val.aff ? val.aff.size + 1 : 0) > 240)
          pe2 = make_err(invalid_word, MsgConv(lang)(val.word),
                         _("The total length is larger than 240 characters."));

        if (!pe2.has_err()) return true;
        if (!skip_invalid_words) return pe2;
        if (log) log->printf(_("Warning: %s Skipping word.\n"), pe2.get_err()->mesg);
        else pe2.ignore_err();
      } 

      // Current string used up; go fetch another one.
      str = 0;
    }
  }
790
}