Coverage Report

Created: 2025-11-24 06:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aspell/common/convert.cpp
Line
Count
Source
1
// This file is part of The New Aspell
2
// Copyright (C) 2001 by Kevin Atkinson under the GNU LGPL license
3
// version 2.0 or 2.1.  You should have received a copy of the LGPL
4
// license along with this library if you did not you can find
5
// it at http://www.gnu.org/.
6
7
#include <assert.h>
8
#include <string.h>
9
#include <math.h>
10
11
#include "asc_ctype.hpp"
12
#include "convert.hpp"
13
#include "fstream.hpp"
14
#include "getdata.hpp"
15
#include "config.hpp"
16
#include "errors.hpp"
17
#include "stack_ptr.hpp"
18
#include "cache-t.hpp"
19
#include "file_util.hpp"
20
#include "file_data_util.hpp"
21
#include "vararray.hpp"
22
23
#include "iostream.hpp"
24
25
#include "gettext.h"
26
27
namespace acommon {
28
29
  typedef unsigned char  byte;
30
  typedef unsigned char  Uni8;
31
  typedef unsigned short Uni16;
32
  typedef unsigned int   Uni32;
33
34
35
  //////////////////////////////////////////////////////////////////////
36
  //////////////////////////////////////////////////////////////////////
37
  //
38
  // Lookups
39
  //
40
  //////////////////////////////////////////////////////////////////////
41
  //////////////////////////////////////////////////////////////////////
42
43
  //////////////////////////////////////////////////////////////////////
44
  //
45
  // ToUniLookup
46
  //
47
48
  class ToUniLookup 
49
  {
50
    Uni32 data[256];
51
    static const Uni32 npos = (Uni32)(-1);
52
  public:
53
    void reset();
54
245k
    Uni32 operator[] (char key) const {return data[(unsigned char)key];}
55
0
    bool have(char key) const {return data[(unsigned char)key] != npos;}
56
    bool insert(char key, Uni32 value);
57
  };
58
59
  void ToUniLookup::reset() 
60
32
  {
61
8.22k
    for (int i = 0; i != 256; ++i)
62
8.19k
      data[i] = npos;
63
32
  }
64
65
  bool ToUniLookup::insert(char key, Uni32 value)
66
5.63k
  {
67
5.63k
    if (data[(unsigned char)key] != npos) 
68
0
      return false;
69
5.63k
    data[(unsigned char)key] = value;
70
5.63k
    return true;
71
5.63k
  }
72
73
  //////////////////////////////////////////////////////////////////////
74
  //
75
  // FromUniLookup
76
  //
77
78
  // Assumes that the maximum number of items in the table is 256
79
  // Also assumes (unsigned char)i == i % 256
80
81
  // Based on the iso-8859-* character sets it is very fast, almost all
82
  // lookups involving no more than 2 comparisons.
83
  // NO looks ups involded more than 3 compassions.
84
  // Also, no division (or modules) is done whatsoever.
85
86
87
  struct UniItem {
88
    Uni32 key;
89
    char  value;
90
  };
91
92
  class FromUniLookup 
93
  {
94
  private:
95
    static const Uni32 npos = (Uni32)(-1);
96
    UniItem * overflow_end;
97
  
98
    UniItem data[256*4];
99
100
    UniItem overflow[256]; // you can never be too careful;
101
  
102
  public:
103
979
    FromUniLookup() {}
104
    void reset();
105
    inline char operator() (Uni32 key, char unknown = '?') const;
106
    bool insert(Uni32 key, char value);
107
  };
108
109
  void FromUniLookup::reset()
110
32
  {
111
32.8k
    for (unsigned i = 0; i != 256*4; ++i)
112
32.7k
      data[i].key = npos;
113
32
    overflow_end = overflow;
114
32
  }
115
116
  inline char FromUniLookup::operator() (Uni32 k, char unknown) const
117
31.2k
  {
118
31.2k
    const UniItem * i = data + (unsigned char)k * 4;
119
120
31.2k
    if (i->key == k) return i->value;
121
0
    ++i;
122
0
    if (i->key == k) return i->value;
123
0
    ++i;
124
0
    if (i->key == k) return i->value;
125
0
    ++i;
126
0
    if (i->key == k) return i->value;
127
  
128
0
    if (i->key == npos) return unknown;
129
  
130
0
    for(i = overflow; i != overflow_end; ++i)
131
0
      if (i->key == k) return i->value;
132
133
0
    return unknown;
134
0
  }
135
136
  bool FromUniLookup::insert(Uni32 k, char v) 
137
5.63k
  {
138
5.63k
    UniItem * i = data + (unsigned char)k * 4;
139
5.63k
    UniItem * e = i + 4;
140
7.21k
    while (i != e && i->key != npos) {
141
1.58k
      if (i->key == k)
142
0
        return false;
143
1.58k
      ++i;
144
1.58k
    }
145
5.63k
    if (i == e) {
146
0
      for(i = overflow; i != overflow_end; ++i)
147
0
        if (i->key == k) return false;
148
0
    }
149
5.63k
    i->key = k;
150
5.63k
    i->value = v;
151
5.63k
    return true;
152
5.63k
  }
153
154
  //////////////////////////////////////////////////////////////////////
155
  //
156
  // CharLookup
157
  //
158
159
  class CharLookup 
160
  {
161
  private:
162
    int data[256];
163
  public:
164
    void reset();
165
0
    char operator[] (char key) const {return data[(unsigned char)key];}
166
    bool insert(char key, char value);
167
  };
168
169
0
  void CharLookup::reset() {
170
0
    for (int i = 0; i != 256; ++i) 
171
0
      data[i] = -1;
172
0
  }
173
174
  bool CharLookup::insert(char key, char value) 
175
0
  {
176
0
    if (data[(unsigned char)key] != -1)
177
0
      return false;
178
0
    data[(unsigned char)key] = value;
179
0
    return true;
180
0
  }
181
182
  //////////////////////////////////////////////////////////////////////
183
  //
184
  // NormLookup
185
  //
186
187
  template <class T>
188
  struct NormTable
189
  {
190
    static const unsigned struct_size;
191
    unsigned mask;
192
    unsigned height;
193
    unsigned width;
194
    unsigned size;
195
    T * end;
196
    T data[1]; // hack for data[]
197
  };
198
199
  template <class T>
200
  const unsigned NormTable<T>::struct_size = sizeof(NormTable<T>) - 1;
201
202
  template <class T, class From>
203
  struct NormLookupRet
204
  {
205
    const typename T::To   * to;
206
    const From * last;
207
    NormLookupRet(const typename T::To * t, From * l) 
208
35.2M
      : to(t), last(l) {}
acommon::NormLookupRet<acommon::FromUniNormEntry, acommon::FilterChar const>::NormLookupRet(unsigned char const*, acommon::FilterChar const*)
Line
Count
Source
208
12.7M
      : to(t), last(l) {}
acommon::NormLookupRet<acommon::FromUniNormEntry, acommon::FilterChar>::NormLookupRet(unsigned char const*, acommon::FilterChar*)
Line
Count
Source
208
20.3M
      : to(t), last(l) {}
acommon::NormLookupRet<acommon::ToUniNormEntry, char const>::NormLookupRet(unsigned short const*, char const*)
Line
Count
Source
208
2.09M
      : to(t), last(l) {}
209
  };
210
  
211
  template <class T, class From>
212
  static inline NormLookupRet<T,From> norm_lookup(const NormTable<T> * d, 
213
                                                  From * s, From * stop,
214
                                                  const typename T::To * def,
215
                                                  From * prev) 
216
35.2M
  {
217
46.8M
  loop:
218
46.8M
    if (s != stop) {
219
46.8M
      const T * i = d->data + (static_cast<typename T::From>(*s) & d->mask);
220
53.9M
      for (;;) {
221
53.9M
        if (i->from == static_cast<typename T::From>(*s)) {
222
35.2M
          if (i->sub_table) {
223
            // really tail recursion
224
11.6M
            if (i->to[1] != T::to_non_char) {def = i->to; prev = s;}
225
11.6M
            d = (const NormTable<T> *)(i->sub_table);
226
11.6M
            s++;
227
11.6M
            goto loop;
228
23.5M
          } else {
229
23.5M
            return NormLookupRet<T,From>(i->to, s);
230
23.5M
          }
231
35.2M
        } else {
232
18.6M
          i += d->height;
233
18.6M
          if (i >= d->end) break;
234
18.6M
        }
235
53.9M
      }
236
46.8M
    }
237
11.6M
    return NormLookupRet<T,From>(def, prev);
238
46.8M
  }
convert.cpp:acommon::NormLookupRet<acommon::FromUniNormEntry, acommon::FilterChar const> acommon::norm_lookup<acommon::FromUniNormEntry, acommon::FilterChar const>(acommon::NormTable<acommon::FromUniNormEntry> const*, acommon::FilterChar const*, acommon::FilterChar const*, acommon::FromUniNormEntry::To const*, acommon::FilterChar const*)
Line
Count
Source
216
12.7M
  {
217
18.2M
  loop:
218
18.2M
    if (s != stop) {
219
18.2M
      const T * i = d->data + (static_cast<typename T::From>(*s) & d->mask);
220
21.5M
      for (;;) {
221
21.5M
        if (i->from == static_cast<typename T::From>(*s)) {
222
12.7M
          if (i->sub_table) {
223
            // really tail recursion
224
5.51M
            if (i->to[1] != T::to_non_char) {def = i->to; prev = s;}
225
5.51M
            d = (const NormTable<T> *)(i->sub_table);
226
5.51M
            s++;
227
5.51M
            goto loop;
228
7.25M
          } else {
229
7.25M
            return NormLookupRet<T,From>(i->to, s);
230
7.25M
          }
231
12.7M
        } else {
232
8.83M
          i += d->height;
233
8.83M
          if (i >= d->end) break;
234
8.83M
        }
235
21.5M
      }
236
18.2M
    }
237
5.51M
    return NormLookupRet<T,From>(def, prev);
238
18.2M
  }
convert.cpp:acommon::NormLookupRet<acommon::FromUniNormEntry, acommon::FilterChar> acommon::norm_lookup<acommon::FromUniNormEntry, acommon::FilterChar>(acommon::NormTable<acommon::FromUniNormEntry> const*, acommon::FilterChar*, acommon::FilterChar*, acommon::FromUniNormEntry::To const*, acommon::FilterChar*)
Line
Count
Source
216
20.3M
  {
217
26.4M
  loop:
218
26.4M
    if (s != stop) {
219
26.4M
      const T * i = d->data + (static_cast<typename T::From>(*s) & d->mask);
220
30.2M
      for (;;) {
221
30.2M
        if (i->from == static_cast<typename T::From>(*s)) {
222
20.3M
          if (i->sub_table) {
223
            // really tail recursion
224
6.09M
            if (i->to[1] != T::to_non_char) {def = i->to; prev = s;}
225
6.09M
            d = (const NormTable<T> *)(i->sub_table);
226
6.09M
            s++;
227
6.09M
            goto loop;
228
14.2M
          } else {
229
14.2M
            return NormLookupRet<T,From>(i->to, s);
230
14.2M
          }
231
20.3M
        } else {
232
9.86M
          i += d->height;
233
9.86M
          if (i >= d->end) break;
234
9.86M
        }
235
30.2M
      }
236
26.4M
    }
237
6.13M
    return NormLookupRet<T,From>(def, prev);
238
26.4M
  }
convert.cpp:acommon::NormLookupRet<acommon::ToUniNormEntry, char const> acommon::norm_lookup<acommon::ToUniNormEntry, char const>(acommon::NormTable<acommon::ToUniNormEntry> const*, char const*, char const*, acommon::ToUniNormEntry::To const*, char const*)
Line
Count
Source
216
2.09M
  {
217
2.09M
  loop:
218
2.09M
    if (s != stop) {
219
2.09M
      const T * i = d->data + (static_cast<typename T::From>(*s) & d->mask);
220
2.09M
      for (;;) {
221
2.09M
        if (i->from == static_cast<typename T::From>(*s)) {
222
2.09M
          if (i->sub_table) {
223
            // really tail recursion
224
0
            if (i->to[1] != T::to_non_char) {def = i->to; prev = s;}
225
0
            d = (const NormTable<T> *)(i->sub_table);
226
0
            s++;
227
0
            goto loop;
228
2.09M
          } else {
229
2.09M
            return NormLookupRet<T,From>(i->to, s);
230
2.09M
          }
231
2.09M
        } else {
232
0
          i += d->height;
233
0
          if (i >= d->end) break;
234
0
        }
235
2.09M
      }
236
2.09M
    }
237
0
    return NormLookupRet<T,From>(def, prev);
238
2.09M
  }
239
240
  template <class T>
241
  void free_norm_table(NormTable<T> * d)
242
31.6k
  {
243
4.73M
    for (T * cur = d->data; cur != d->end; ++cur) {
244
4.70M
      if (cur->sub_table) 
245
28.0k
        free_norm_table<T>(static_cast<NormTable<T> *>(cur->sub_table));
246
4.70M
    }
247
31.6k
    free(d);
248
31.6k
  }
void acommon::free_norm_table<acommon::FromUniNormEntry>(acommon::NormTable<acommon::FromUniNormEntry>*)
Line
Count
Source
242
29.8k
  {
243
4.28M
    for (T * cur = d->data; cur != d->end; ++cur) {
244
4.25M
      if (cur->sub_table) 
245
28.0k
        free_norm_table<T>(static_cast<NormTable<T> *>(cur->sub_table));
246
4.25M
    }
247
29.8k
    free(d);
248
29.8k
  }
void acommon::free_norm_table<acommon::ToUniNormEntry>(acommon::NormTable<acommon::ToUniNormEntry>*)
Line
Count
Source
242
1.75k
  {
243
451k
    for (T * cur = d->data; cur != d->end; ++cur) {
244
449k
      if (cur->sub_table) 
245
0
        free_norm_table<T>(static_cast<NormTable<T> *>(cur->sub_table));
246
449k
    }
247
1.75k
    free(d);
248
1.75k
  }
249
250
  struct FromUniNormEntry
251
  {
252
    typedef Uni32 From;
253
    Uni32 from;
254
    typedef byte To;
255
    byte  to[4];
256
    static const From from_non_char = (From)(-1);
257
    static const To   to_non_char   = 0x10;
258
    static const unsigned max_to = 4;
259
    void * sub_table;
260
  } 
261
#ifdef __GNUC__    
262
    __attribute__ ((aligned (16)))
263
#endif
264
  ;
265
266
  struct ToUniNormEntry
267
  {
268
    typedef byte From;
269
    byte from;
270
    typedef Uni16 To;
271
    Uni16 to[3];
272
    static const From from_non_char = 0x10;
273
    static const To   to_non_char   = 0x10;
274
    static const unsigned max_to = 3;
275
    void * sub_table;
276
  } 
277
#ifdef __GNUC__    
278
    __attribute__ ((aligned (16)))
279
#endif
280
  ;
281
  
282
  //////////////////////////////////////////////////////////////////////
283
  //
284
  // read in char data
285
  //
286
287
  PosibErr<void> read_in_char_data (const Config & config,
288
                                    ParmStr encoding,
289
                                    ToUniLookup & to,
290
                                    FromUniLookup & from)
291
32
  {
292
32
    to.reset();
293
32
    from.reset();
294
    
295
32
    String dir1,dir2,file_name;
296
32
    fill_data_dir(&config, dir1, dir2);
297
32
    find_file(file_name,dir1,dir2,encoding,".cset");
298
299
32
    FStream data;
300
32
    PosibErrBase err = data.open(file_name, "r");
301
32
    if (err.get_err()) { 
302
10
      char mesg[300];
303
10
      snprintf(mesg, 300, _("This could also mean that the file \"%s\" could not be opened for reading or does not exist."),
304
10
               file_name.c_str());
305
10
      return make_err(unknown_encoding, encoding, mesg);
306
10
    }
307
22
    unsigned chr;
308
22
    Uni32 uni;
309
22
    String line;
310
22
    char * p;
311
66
    do {
312
66
      p = get_nb_line(data, line);
313
66
    } while (*p != '/');
314
5.65k
    for (chr = 0; chr != 256; ++chr) {
315
5.63k
      p = get_nb_line(data, line);
316
5.63k
      if (strtoul(p, 0, 16) != chr)
317
0
        return make_err(bad_file_format, file_name);
318
5.63k
      uni = strtoul(p + 3, 0, 16);
319
5.63k
      to.insert(chr, uni);
320
5.63k
      from.insert(uni, chr);
321
5.63k
    }
322
  
323
22
    return no_err;
324
22
  }
325
326
  //////////////////////////////////////////////////////////////////////
327
  //
328
  // read in norm data
329
  //
330
331
  struct Tally 
332
  {
333
    int size;
334
    Uni32 mask;
335
    int max;
336
    int * data;
337
94.8k
    Tally(int s, int * d) : size(s), mask(s - 1), max(0), data(d) {
338
94.8k
      memset(data, 0, sizeof(int)*size);
339
94.8k
    }
340
4.83M
    void add(Uni32 chr) {
341
4.83M
      Uni32 p = chr & mask;
342
4.83M
      data[p]++;
343
4.83M
      if (data[p] > max) max = data[p];
344
4.83M
    }
345
  };
346
347
# define sanity(check) \
348
8.28M
    if (!(check)) return sanity_fail(__FILE__, FUNC, __LINE__, #check)
349
350
  static PosibErrBase sanity_fail(const char * file, const char * func, 
351
                                  unsigned line, const char * check_str) 
352
0
  {
353
0
    char mesg[500];
354
0
    snprintf(mesg, 500, "%s:%d: %s: Assertion \"%s\" failed.",
355
0
             file,  line, func, check_str);
356
0
    return make_err(bad_input_error, mesg);
357
0
  }
358
# define CREATE_NORM_TABLE(T, in, buf, res) \
359
31.6k
  do { PosibErr<NormTable<T> *> pe( create_norm_table<T>(in,buf) );\
360
31.6k
       if (pe.has_err()) return PosibErrBase(pe); \
361
31.6k
       res = pe.data; } while(false)
362
363
  template <class T>
364
  static PosibErr< NormTable<T> * > create_norm_table(IStream & in, String & buf)
365
31.6k
  {
366
31.6k
    const char FUNC[] = "create_norm_table";
367
31.6k
    const char * p = get_nb_line(in, buf);
368
31.6k
    sanity(*p == 'N');
369
31.6k
    ++p;
370
31.6k
    int size = strtoul(p, (char **)&p, 10);
371
31.6k
    VARARRAY(T, d, size);
372
31.6k
    memset(d, 0, sizeof(T) * size);
373
31.6k
    int sz = 1 << (unsigned)floor(log(size <= 1 ? 1.0 : size - 1)/log(2.0));
374
31.6k
    VARARRAY(int, tally0_d, sz);   Tally tally0(sz,   tally0_d);
375
31.6k
    VARARRAY(int, tally1_d, sz*2); Tally tally1(sz*2, tally1_d);
376
31.6k
    VARARRAY(int, tally2_d, sz*4); Tally tally2(sz*4, tally2_d);
377
31.6k
    T * cur = d;
378
1.64M
    while (p = get_nb_line(in, buf), *p != '.') {
379
1.61M
      Uni32 f = strtoul(p, (char **)&p, 16);
380
1.61M
      cur->from = static_cast<typename T::From>(f);
381
1.61M
      sanity(f == cur->from);
382
1.61M
      tally0.add(f);
383
1.61M
      tally1.add(f);
384
1.61M
      tally2.add(f);
385
1.61M
      ++p;
386
1.61M
      sanity(*p == '>');
387
1.61M
      ++p;
388
1.61M
      sanity(*p == ' ');
389
1.61M
      ++p;
390
1.61M
      unsigned i = 0;
391
1.61M
      if (*p != '-') {
392
3.30M
        for (;; ++i) {
393
3.30M
          const char * q = p;
394
3.30M
          Uni32 t = strtoul(p, (char **)&p, 16);
395
3.30M
          if (q == p) break;
396
1.68M
          sanity(i < d->max_to);
397
1.68M
          cur->to[i] = static_cast<typename T::To>(t);
398
1.68M
          sanity(t == static_cast<Uni32>(cur->to[i]));
399
1.68M
        } 
400
1.61M
      } else {
401
0
        cur->to[0] = 0;
402
0
        cur->to[1] = T::to_non_char;
403
0
      }
404
1.61M
      if (*p == ' ') ++p;
405
1.61M
      if (*p == '/') CREATE_NORM_TABLE(T, in, buf, cur->sub_table);
406
1.61M
      ++cur;
407
1.61M
    }
408
31.6k
    sanity(cur - d == size);
409
31.6k
    Tally * which = &tally0;
410
31.6k
    if (which->max > tally1.max) which = &tally1;
411
31.6k
    if (which->max > tally2.max) which = &tally2;
412
31.6k
    NormTable<T> * final = (NormTable<T> *)calloc(1, NormTable<T>::struct_size + 
413
31.6k
                                                  sizeof(T) * which->size * which->max);
414
31.6k
    memset(final, 0, NormTable<T>::struct_size + sizeof(T) * which->size * which->max);
415
31.6k
    final->mask = which->size - 1;
416
31.6k
    final->height = which->size;
417
31.6k
    final->width = which->max;
418
31.6k
    final->end = final->data + which->size * which->max;
419
31.6k
    final->size = size;
420
1.64M
    for (cur = d; cur != d + size; ++cur) {
421
1.61M
      T * dest = final->data + (cur->from & final->mask);
422
1.67M
      while (dest->from != 0) dest += final->height;
423
1.61M
      *dest = *cur;
424
1.61M
      if (dest->from == 0) dest->from = T::from_non_char;
425
1.61M
    }
426
74.6k
    for (T * dest = final->data; dest < final->end; dest += final->height) {
427
43.0k
      if (dest->from == 0 || (dest->from == T::from_non_char && dest->to[0] == 0)) {
428
3.51k
        dest->from = T::from_non_char;
429
3.51k
        dest->to[0] = T::to_non_char;
430
3.51k
      }
431
43.0k
    }
432
31.6k
    return final;
433
31.6k
  }
convert.cpp:acommon::PosibErr<acommon::NormTable<acommon::FromUniNormEntry>*> acommon::create_norm_table<acommon::FromUniNormEntry>(acommon::IStream&, acommon::String&)
Line
Count
Source
365
29.8k
  {
366
29.8k
    const char FUNC[] = "create_norm_table";
367
29.8k
    const char * p = get_nb_line(in, buf);
368
29.8k
    sanity(*p == 'N');
369
29.8k
    ++p;
370
29.8k
    int size = strtoul(p, (char **)&p, 10);
371
29.8k
    VARARRAY(T, d, size);
372
29.8k
    memset(d, 0, sizeof(T) * size);
373
29.8k
    int sz = 1 << (unsigned)floor(log(size <= 1 ? 1.0 : size - 1)/log(2.0));
374
29.8k
    VARARRAY(int, tally0_d, sz);   Tally tally0(sz,   tally0_d);
375
29.8k
    VARARRAY(int, tally1_d, sz*2); Tally tally1(sz*2, tally1_d);
376
29.8k
    VARARRAY(int, tally2_d, sz*4); Tally tally2(sz*4, tally2_d);
377
29.8k
    T * cur = d;
378
1.19M
    while (p = get_nb_line(in, buf), *p != '.') {
379
1.16M
      Uni32 f = strtoul(p, (char **)&p, 16);
380
1.16M
      cur->from = static_cast<typename T::From>(f);
381
1.16M
      sanity(f == cur->from);
382
1.16M
      tally0.add(f);
383
1.16M
      tally1.add(f);
384
1.16M
      tally2.add(f);
385
1.16M
      ++p;
386
1.16M
      sanity(*p == '>');
387
1.16M
      ++p;
388
1.16M
      sanity(*p == ' ');
389
1.16M
      ++p;
390
1.16M
      unsigned i = 0;
391
1.16M
      if (*p != '-') {
392
2.35M
        for (;; ++i) {
393
2.35M
          const char * q = p;
394
2.35M
          Uni32 t = strtoul(p, (char **)&p, 16);
395
2.35M
          if (q == p) break;
396
1.19M
          sanity(i < d->max_to);
397
1.19M
          cur->to[i] = static_cast<typename T::To>(t);
398
1.19M
          sanity(t == static_cast<Uni32>(cur->to[i]));
399
1.19M
        } 
400
1.16M
      } else {
401
0
        cur->to[0] = 0;
402
0
        cur->to[1] = T::to_non_char;
403
0
      }
404
1.16M
      if (*p == ' ') ++p;
405
1.16M
      if (*p == '/') CREATE_NORM_TABLE(T, in, buf, cur->sub_table);
406
1.16M
      ++cur;
407
1.16M
    }
408
29.8k
    sanity(cur - d == size);
409
29.8k
    Tally * which = &tally0;
410
29.8k
    if (which->max > tally1.max) which = &tally1;
411
29.8k
    if (which->max > tally2.max) which = &tally2;
412
29.8k
    NormTable<T> * final = (NormTable<T> *)calloc(1, NormTable<T>::struct_size + 
413
29.8k
                                                  sizeof(T) * which->size * which->max);
414
29.8k
    memset(final, 0, NormTable<T>::struct_size + sizeof(T) * which->size * which->max);
415
29.8k
    final->mask = which->size - 1;
416
29.8k
    final->height = which->size;
417
29.8k
    final->width = which->max;
418
29.8k
    final->end = final->data + which->size * which->max;
419
29.8k
    final->size = size;
420
1.19M
    for (cur = d; cur != d + size; ++cur) {
421
1.16M
      T * dest = final->data + (cur->from & final->mask);
422
1.22M
      while (dest->from != 0) dest += final->height;
423
1.16M
      *dest = *cur;
424
1.16M
      if (dest->from == 0) dest->from = T::from_non_char;
425
1.16M
    }
426
71.1k
    for (T * dest = final->data; dest < final->end; dest += final->height) {
427
41.2k
      if (dest->from == 0 || (dest->from == T::from_non_char && dest->to[0] == 0)) {
428
1.75k
        dest->from = T::from_non_char;
429
1.75k
        dest->to[0] = T::to_non_char;
430
1.75k
      }
431
41.2k
    }
432
29.8k
    return final;
433
29.8k
  }
convert.cpp:acommon::PosibErr<acommon::NormTable<acommon::ToUniNormEntry>*> acommon::create_norm_table<acommon::ToUniNormEntry>(acommon::IStream&, acommon::String&)
Line
Count
Source
365
1.75k
  {
366
1.75k
    const char FUNC[] = "create_norm_table";
367
1.75k
    const char * p = get_nb_line(in, buf);
368
1.75k
    sanity(*p == 'N');
369
1.75k
    ++p;
370
1.75k
    int size = strtoul(p, (char **)&p, 10);
371
1.75k
    VARARRAY(T, d, size);
372
1.75k
    memset(d, 0, sizeof(T) * size);
373
1.75k
    int sz = 1 << (unsigned)floor(log(size <= 1 ? 1.0 : size - 1)/log(2.0));
374
1.75k
    VARARRAY(int, tally0_d, sz);   Tally tally0(sz,   tally0_d);
375
1.75k
    VARARRAY(int, tally1_d, sz*2); Tally tally1(sz*2, tally1_d);
376
1.75k
    VARARRAY(int, tally2_d, sz*4); Tally tally2(sz*4, tally2_d);
377
1.75k
    T * cur = d;
378
451k
    while (p = get_nb_line(in, buf), *p != '.') {
379
449k
      Uni32 f = strtoul(p, (char **)&p, 16);
380
449k
      cur->from = static_cast<typename T::From>(f);
381
449k
      sanity(f == cur->from);
382
449k
      tally0.add(f);
383
449k
      tally1.add(f);
384
449k
      tally2.add(f);
385
449k
      ++p;
386
449k
      sanity(*p == '>');
387
449k
      ++p;
388
449k
      sanity(*p == ' ');
389
449k
      ++p;
390
449k
      unsigned i = 0;
391
449k
      if (*p != '-') {
392
945k
        for (;; ++i) {
393
945k
          const char * q = p;
394
945k
          Uni32 t = strtoul(p, (char **)&p, 16);
395
945k
          if (q == p) break;
396
496k
          sanity(i < d->max_to);
397
496k
          cur->to[i] = static_cast<typename T::To>(t);
398
496k
          sanity(t == static_cast<Uni32>(cur->to[i]));
399
496k
        } 
400
449k
      } else {
401
0
        cur->to[0] = 0;
402
0
        cur->to[1] = T::to_non_char;
403
0
      }
404
449k
      if (*p == ' ') ++p;
405
449k
      if (*p == '/') CREATE_NORM_TABLE(T, in, buf, cur->sub_table);
406
449k
      ++cur;
407
449k
    }
408
1.75k
    sanity(cur - d == size);
409
1.75k
    Tally * which = &tally0;
410
1.75k
    if (which->max > tally1.max) which = &tally1;
411
1.75k
    if (which->max > tally2.max) which = &tally2;
412
1.75k
    NormTable<T> * final = (NormTable<T> *)calloc(1, NormTable<T>::struct_size + 
413
1.75k
                                                  sizeof(T) * which->size * which->max);
414
1.75k
    memset(final, 0, NormTable<T>::struct_size + sizeof(T) * which->size * which->max);
415
1.75k
    final->mask = which->size - 1;
416
1.75k
    final->height = which->size;
417
1.75k
    final->width = which->max;
418
1.75k
    final->end = final->data + which->size * which->max;
419
1.75k
    final->size = size;
420
451k
    for (cur = d; cur != d + size; ++cur) {
421
449k
      T * dest = final->data + (cur->from & final->mask);
422
449k
      while (dest->from != 0) dest += final->height;
423
449k
      *dest = *cur;
424
449k
      if (dest->from == 0) dest->from = T::from_non_char;
425
449k
    }
426
3.51k
    for (T * dest = final->data; dest < final->end; dest += final->height) {
427
1.75k
      if (dest->from == 0 || (dest->from == T::from_non_char && dest->to[0] == 0)) {
428
1.75k
        dest->from = T::from_non_char;
429
1.75k
        dest->to[0] = T::to_non_char;
430
1.75k
      }
431
1.75k
    }
432
1.75k
    return final;
433
1.75k
  }
434
435
  static PosibErr<void> init_norm_tables(FStream & in, NormTables * d) 
436
878
  {
437
878
    const char FUNC[] = "init_norm_tables";
438
878
    String l;
439
878
    get_nb_line(in, l);
440
878
    remove_comments(l);
441
878
    sanity (l == "INTERNAL");
442
878
    get_nb_line(in, l);
443
878
    remove_comments(l);
444
878
    sanity (l == "/");
445
878
    CREATE_NORM_TABLE(FromUniNormEntry, in, l, d->internal);
446
878
    get_nb_line(in, l);
447
878
    remove_comments(l);
448
878
    sanity (l == "STRICT");
449
878
    char * p = get_nb_line(in, l);
450
878
    remove_comments(l);
451
878
    if (l == "/") {
452
878
      CREATE_NORM_TABLE(FromUniNormEntry, in, l, d->strict_d);
453
878
      d->strict = d->strict_d;
454
878
    } else {
455
0
      sanity(*p == '=');
456
0
      ++p; ++p;
457
0
      sanity(strcmp(p, "INTERNAL") == 0);
458
0
      d->strict = d->internal;
459
0
    }
460
3.51k
    while (get_nb_line(in, l)) {
461
2.63k
      remove_comments(l);
462
2.63k
      d->to_uni.push_back(NormTables::ToUniTable());
463
2.63k
      NormTables::ToUniTable & e = d->to_uni.back();
464
2.63k
      e.name.resize(l.size());
465
11.4k
      for (unsigned i = 0; i != l.size(); ++i)
466
8.78k
        e.name[i] = asc_tolower(l[i]);
467
2.63k
      char * p = get_nb_line(in, l);
468
2.63k
      remove_comments(l);
469
2.63k
      if (l == "/") {
470
1.75k
        CREATE_NORM_TABLE(ToUniNormEntry, in, l, e.data);
471
1.75k
        e.ptr = e.data;
472
1.75k
      } else {
473
878
        sanity(*p == '=');
474
878
        ++p; ++p;
475
3.51k
        for (char * q = p; *q; ++q) *q = asc_tolower(*q);
476
878
        Vector<NormTables::ToUniTable>::iterator i = d->to_uni.begin();
477
1.75k
        while (i->name != p && i != d->to_uni.end()) ++i;
478
878
        sanity(i != d->to_uni.end());
479
878
        e.ptr = i->ptr;
480
878
        get_nb_line(in, l);
481
878
      }
482
2.63k
    }  
483
878
    return no_err;
484
878
  }
485
486
  PosibErr<NormTables *> NormTables::get_new(const String & encoding, 
487
                                             const Config * config)
488
878
  {
489
878
    String dir1,dir2,file_name;
490
878
    fill_data_dir(config, dir1, dir2);
491
878
    find_file(file_name,dir1,dir2,encoding,".cmap");
492
    
493
878
    FStream in;
494
878
    PosibErrBase err = in.open(file_name, "r");
495
878
    if (err.get_err()) { 
496
0
      char mesg[300];
497
0
      snprintf(mesg, 300, _("This could also mean that the file \"%s\" could not be opened for reading or does not exist."),
498
0
               file_name.c_str());
499
0
      return make_err(unknown_encoding, encoding, mesg); // FIXME
500
0
    }
501
502
878
    NormTables * d = new NormTables;
503
878
    d->key = encoding;
504
878
    err = init_norm_tables(in, d);
505
878
    if (err.has_err()) {
506
0
      return make_err(bad_file_format, file_name, err.get_err()->mesg);
507
0
    }
508
509
878
    return d;
510
511
878
  }
512
513
  NormTables::~NormTables()
514
878
  {
515
878
    free_norm_table<FromUniNormEntry>(internal);
516
878
    if (strict_d)
517
878
      free_norm_table<FromUniNormEntry>(strict_d);
518
3.51k
    for (unsigned i = 0; i != to_uni.size(); ++i) {
519
2.63k
      if (to_uni[i].data)
520
1.75k
        free_norm_table<ToUniNormEntry>(to_uni[i].data);
521
2.63k
    }
522
878
  }
523
524
  //////////////////////////////////////////////////////////////////////
525
  //////////////////////////////////////////////////////////////////////
526
  //
527
  //  Convert
528
  //
529
  //////////////////////////////////////////////////////////////////////
530
  //////////////////////////////////////////////////////////////////////
531
532
533
  bool operator== (const Convert & rhs, const Convert & lhs)
534
0
  {
535
0
    return strcmp(rhs.in_code(), lhs.in_code()) == 0
536
0
      && strcmp(rhs.out_code(), lhs.out_code()) == 0;
537
0
  }
538
539
  //////////////////////////////////////////////////////////////////////
540
  //
541
  // Trivial Conversion
542
  //
543
544
  const char * unsupported_null_term_wide_string_msg =
545
    "Null-terminated wide-character strings unsupported when used this way.";
546
547
  template <typename Chr>
548
  struct DecodeDirect : public Decode 
549
  {
550
967
    DecodeDirect() {type_width = sizeof(Chr);}
acommon::DecodeDirect<unsigned char>::DecodeDirect()
Line
Count
Source
550
828
    DecodeDirect() {type_width = sizeof(Chr);}
acommon::DecodeDirect<unsigned short>::DecodeDirect()
Line
Count
Source
550
26
    DecodeDirect() {type_width = sizeof(Chr);}
acommon::DecodeDirect<unsigned int>::DecodeDirect()
Line
Count
Source
550
113
    DecodeDirect() {type_width = sizeof(Chr);}
551
221k
    void decode(const char * in0, int size, FilterCharVector & out) const {
552
221k
      const Chr * in = reinterpret_cast<const Chr *>(in0);
553
221k
      if (size == -sizeof(Chr)) {
554
761k
        for (;*in; ++in)
555
554k
          out.append(*in, sizeof(Chr));
556
207k
      } else if (size <= -1) {
557
0
        fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg);
558
0
        abort();
559
14.7k
      } else {
560
14.7k
        const Chr * stop = reinterpret_cast<const Chr *>(in0) + size/sizeof(Chr);
561
36.2M
        for (;in != stop; ++in)
562
36.2M
          out.append(*in, sizeof(Chr));
563
14.7k
      }
564
221k
    }
acommon::DecodeDirect<unsigned char>::decode(char const*, int, acommon::FilterCharVector&) const
Line
Count
Source
551
219k
    void decode(const char * in0, int size, FilterCharVector & out) const {
552
219k
      const Chr * in = reinterpret_cast<const Chr *>(in0);
553
219k
      if (size == -sizeof(Chr)) {
554
761k
        for (;*in; ++in)
555
554k
          out.append(*in, sizeof(Chr));
556
207k
      } else if (size <= -1) {
557
0
        fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg);
558
0
        abort();
559
12.2k
      } else {
560
12.2k
        const Chr * stop = reinterpret_cast<const Chr *>(in0) + size/sizeof(Chr);
561
35.7M
        for (;in != stop; ++in)
562
35.7M
          out.append(*in, sizeof(Chr));
563
12.2k
      }
564
219k
    }
acommon::DecodeDirect<unsigned short>::decode(char const*, int, acommon::FilterCharVector&) const
Line
Count
Source
551
1.42k
    void decode(const char * in0, int size, FilterCharVector & out) const {
552
1.42k
      const Chr * in = reinterpret_cast<const Chr *>(in0);
553
1.42k
      if (size == -sizeof(Chr)) {
554
0
        for (;*in; ++in)
555
0
          out.append(*in, sizeof(Chr));
556
1.42k
      } else if (size <= -1) {
557
0
        fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg);
558
0
        abort();
559
1.42k
      } else {
560
1.42k
        const Chr * stop = reinterpret_cast<const Chr *>(in0) + size/sizeof(Chr);
561
421k
        for (;in != stop; ++in)
562
419k
          out.append(*in, sizeof(Chr));
563
1.42k
      }
564
1.42k
    }
acommon::DecodeDirect<unsigned int>::decode(char const*, int, acommon::FilterCharVector&) const
Line
Count
Source
551
1.06k
    void decode(const char * in0, int size, FilterCharVector & out) const {
552
1.06k
      const Chr * in = reinterpret_cast<const Chr *>(in0);
553
1.06k
      if (size == -sizeof(Chr)) {
554
0
        for (;*in; ++in)
555
0
          out.append(*in, sizeof(Chr));
556
1.06k
      } else if (size <= -1) {
557
0
        fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg);
558
0
        abort();
559
1.06k
      } else {
560
1.06k
        const Chr * stop = reinterpret_cast<const Chr *>(in0) + size/sizeof(Chr);
561
82.9k
        for (;in != stop; ++in)
562
81.9k
          out.append(*in, sizeof(Chr));
563
1.06k
      }
564
1.06k
    }
565
    PosibErr<void> decode_ec(const char * in0, int size, 
566
0
                             FilterCharVector & out, ParmStr) const {
567
0
      DecodeDirect::decode(in0, size, out);
568
0
      return no_err;
569
0
    }
Unexecuted instantiation: acommon::DecodeDirect<unsigned char>::decode_ec(char const*, int, acommon::FilterCharVector&, acommon::ParmString const&) const
Unexecuted instantiation: acommon::DecodeDirect<unsigned short>::decode_ec(char const*, int, acommon::FilterCharVector&, acommon::ParmString const&) const
Unexecuted instantiation: acommon::DecodeDirect<unsigned int>::decode_ec(char const*, int, acommon::FilterCharVector&, acommon::ParmString const&) const
570
  };
571
572
  template <typename Chr>
573
  struct EncodeDirect : public Encode
574
  {
575
1.01k
    EncodeDirect() {type_width = sizeof(Chr);}
acommon::EncodeDirect<unsigned char>::EncodeDirect()
Line
Count
Source
575
828
    EncodeDirect() {type_width = sizeof(Chr);}
acommon::EncodeDirect<unsigned short>::EncodeDirect()
Line
Count
Source
575
26
    EncodeDirect() {type_width = sizeof(Chr);}
acommon::EncodeDirect<unsigned int>::EncodeDirect()
Line
Count
Source
575
162
    EncodeDirect() {type_width = sizeof(Chr);}
576
    void encode(const FilterChar * in, const FilterChar * stop, 
577
770k
                CharVector & out) const {
578
3.56M
      for (; in != stop; ++in) {
579
2.79M
        Chr c = in->chr;
580
2.79M
        if (c != in->chr) c = '?';
581
2.79M
        out.append(&c, sizeof(Chr));
582
2.79M
      }
583
770k
    }
acommon::EncodeDirect<unsigned char>::encode(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&) const
Line
Count
Source
577
563k
                CharVector & out) const {
578
2.80M
      for (; in != stop; ++in) {
579
2.24M
        Chr c = in->chr;
580
2.24M
        if (c != in->chr) c = '?';
581
2.24M
        out.append(&c, sizeof(Chr));
582
2.24M
      }
583
563k
    }
acommon::EncodeDirect<unsigned short>::encode(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&) const
Line
Count
Source
577
145k
                CharVector & out) const {
578
583k
      for (; in != stop; ++in) {
579
438k
        Chr c = in->chr;
580
438k
        if (c != in->chr) c = '?';
581
438k
        out.append(&c, sizeof(Chr));
582
438k
      }
583
145k
    }
acommon::EncodeDirect<unsigned int>::encode(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&) const
Line
Count
Source
577
62.0k
                CharVector & out) const {
578
178k
      for (; in != stop; ++in) {
579
115k
        Chr c = in->chr;
580
115k
        if (c != in->chr) c = '?';
581
115k
        out.append(&c, sizeof(Chr));
582
115k
      }
583
62.0k
    }
584
    PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop, 
585
4.58k
                             CharVector & out, ParmStr orig) const {
586
38.8k
      for (; in != stop; ++in) {
587
34.2k
        Chr c = in->chr;
588
34.2k
        if (c != in->chr) {
589
0
          char m[70];
590
0
          snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr);
591
0
          return make_err(invalid_string, orig, m);
592
0
        }
593
        
594
34.2k
        out.append(&c, sizeof(Chr));
595
34.2k
      }
596
4.58k
      return no_err;
597
4.58k
    }
acommon::EncodeDirect<unsigned char>::encode_ec(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&, acommon::ParmString const&) const
Line
Count
Source
585
1.50k
                             CharVector & out, ParmStr orig) const {
586
4.51k
      for (; in != stop; ++in) {
587
3.00k
        Chr c = in->chr;
588
3.00k
        if (c != in->chr) {
589
0
          char m[70];
590
0
          snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr);
591
0
          return make_err(invalid_string, orig, m);
592
0
        }
593
        
594
3.00k
        out.append(&c, sizeof(Chr));
595
3.00k
      }
596
1.50k
      return no_err;
597
1.50k
    }
Unexecuted instantiation: acommon::EncodeDirect<unsigned short>::encode_ec(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&, acommon::ParmString const&) const
acommon::EncodeDirect<unsigned int>::encode_ec(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&, acommon::ParmString const&) const
Line
Count
Source
585
3.07k
                             CharVector & out, ParmStr orig) const {
586
34.3k
      for (; in != stop; ++in) {
587
31.2k
        Chr c = in->chr;
588
31.2k
        if (c != in->chr) {
589
0
          char m[70];
590
0
          snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr);
591
0
          return make_err(invalid_string, orig, m);
592
0
        }
593
        
594
31.2k
        out.append(&c, sizeof(Chr));
595
31.2k
      }
596
3.07k
      return no_err;
597
3.07k
    }
598
65
    bool encode(FilterChar * &, FilterChar * &, FilterCharVector &) const {
599
65
      return true;
600
65
    }
acommon::EncodeDirect<unsigned char>::encode(acommon::FilterChar*&, acommon::FilterChar*&, acommon::FilterCharVector&) const
Line
Count
Source
598
65
    bool encode(FilterChar * &, FilterChar * &, FilterCharVector &) const {
599
65
      return true;
600
65
    }
Unexecuted instantiation: acommon::EncodeDirect<unsigned short>::encode(acommon::FilterChar*&, acommon::FilterChar*&, acommon::FilterCharVector&) const
Unexecuted instantiation: acommon::EncodeDirect<unsigned int>::encode(acommon::FilterChar*&, acommon::FilterChar*&, acommon::FilterCharVector&) const
601
  };
602
603
  template <typename Chr>
604
  struct ConvDirect : public DirectConv
605
  {
606
56
    ConvDirect() {type_width = sizeof(Chr);}
Unexecuted instantiation: acommon::ConvDirect<unsigned short>::ConvDirect()
Unexecuted instantiation: acommon::ConvDirect<unsigned int>::ConvDirect()
acommon::ConvDirect<char>::ConvDirect()
Line
Count
Source
606
56
    ConvDirect() {type_width = sizeof(Chr);}
607
41.7k
    void convert(const char * in0, int size, CharVector & out) const {
608
41.7k
      if (size == -sizeof(Chr)) {
609
40.8k
        const Chr * in = reinterpret_cast<const Chr *>(in0);
610
179k
        for (;*in != 0; ++in)
611
138k
          out.append(in, sizeof(Chr));
612
40.8k
      } else if (size <= -1) {
613
0
        fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg);
614
0
        abort();
615
914
      } else {
616
914
        out.append(in0, size);
617
914
      }
618
41.7k
    }
Unexecuted instantiation: acommon::ConvDirect<unsigned short>::convert(char const*, int, acommon::String&) const
Unexecuted instantiation: acommon::ConvDirect<unsigned int>::convert(char const*, int, acommon::String&) const
acommon::ConvDirect<char>::convert(char const*, int, acommon::String&) const
Line
Count
Source
607
41.7k
    void convert(const char * in0, int size, CharVector & out) const {
608
41.7k
      if (size == -sizeof(Chr)) {
609
40.8k
        const Chr * in = reinterpret_cast<const Chr *>(in0);
610
179k
        for (;*in != 0; ++in)
611
138k
          out.append(in, sizeof(Chr));
612
40.8k
      } else if (size <= -1) {
613
0
        fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg);
614
0
        abort();
615
914
      } else {
616
914
        out.append(in0, size);
617
914
      }
618
41.7k
    }
619
    PosibErr<void> convert_ec(const char * in0, int size, 
620
0
                              CharVector & out, ParmStr) const {
621
0
      ConvDirect::convert(in0, size, out);
622
0
      return no_err;
623
0
    }
Unexecuted instantiation: acommon::ConvDirect<unsigned short>::convert_ec(char const*, int, acommon::String&, acommon::ParmString const&) const
Unexecuted instantiation: acommon::ConvDirect<unsigned int>::convert_ec(char const*, int, acommon::String&, acommon::ParmString const&) const
Unexecuted instantiation: acommon::ConvDirect<char>::convert_ec(char const*, int, acommon::String&, acommon::ParmString const&) const
624
  };
625
626
  //////////////////////////////////////////////////////////////////////
627
  //
628
  //  Lookup Conversion
629
  //
630
631
  struct DecodeLookup : public Decode 
632
  {
633
    ToUniLookup lookup;
634
21
    PosibErr<void> init(ParmStr code, const Config & c) {
635
21
      FromUniLookup unused;
636
21
      return read_in_char_data(c, code, lookup, unused);
637
21
    }
638
791
    void decode(const char * in, int size, FilterCharVector & out) const {
639
791
      if (size == -1) {
640
0
        for (;*in; ++in)
641
0
          out.append(lookup[*in]);
642
791
      } else {
643
791
        const char * stop = in + size;
644
246k
        for (;in != stop; ++in)
645
245k
          out.append(lookup[*in]);
646
791
      }
647
791
    }
648
    PosibErr<void> decode_ec(const char * in, int size, 
649
0
                             FilterCharVector & out, ParmStr) const {
650
0
      DecodeLookup::decode(in, size, out);
651
0
      return no_err;
652
0
    }
653
  };
654
655
  struct DecodeNormLookup : public Decode 
656
  {
657
    typedef ToUniNormEntry E;
658
    NormTable<E> * data;
659
7.76k
    DecodeNormLookup(NormTable<E> * d) : data(d) {}
660
    // must be null terminated
661
    // FIXME: Why must it be null terminated?
662
568k
    void decode(const char * in, int size, FilterCharVector & out) const {
663
568k
      const char * stop = in + size; // will work even if size -1
664
2.66M
      while (in != stop) {
665
2.66M
        if (*in == 0) {
666
568k
          if (size == -1) break;
667
0
          out.append(0);
668
0
          ++in;
669
2.09M
        } else {
670
2.09M
          NormLookupRet<E,const char> ret = norm_lookup<E>(data, in, stop, 0, in);
671
4.19M
          for (unsigned i = 0; ret.to[i] && i < E::max_to; ++i)
672
2.09M
            out.append(ret.to[i]);
673
2.09M
          in = ret.last + 1;
674
2.09M
        }
675
2.66M
      }
676
568k
    }
677
    PosibErr<void> decode_ec(const char * in, int size, 
678
0
                             FilterCharVector & out, ParmStr) const {
679
0
      DecodeNormLookup::decode(in, size, out);
680
0
      return no_err;
681
0
    }
682
  };
683
684
  struct EncodeLookup : public Encode 
685
  {
686
    FromUniLookup lookup;
687
    PosibErr<void> init(ParmStr code, const Config & c) 
688
11
      {ToUniLookup unused;
689
11
      return read_in_char_data(c, code, unused, lookup);}
690
    void encode(const FilterChar * in, const FilterChar * stop, 
691
7.34k
                CharVector & out) const {
692
38.6k
      for (; in != stop; ++in) {
693
31.2k
        out.append(lookup(*in));
694
31.2k
      }
695
7.34k
    }
696
    PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop, 
697
0
                             CharVector & out, ParmStr orig) const {
698
0
      for (; in != stop; ++in) {
699
0
        char c = lookup(*in, '\0');
700
0
        if (c == '\0' && in->chr != 0) {
701
0
          char m[70];
702
0
          snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr);
703
0
          return make_err(invalid_string, orig, m);
704
0
        }
705
0
        out.append(c);
706
0
      }
707
0
      return no_err;
708
0
    }
709
    bool encode(FilterChar * & in0, FilterChar * & stop,
710
0
                FilterCharVector & out) const {
711
0
      FilterChar * in = in0;
712
0
      for (; in != stop; ++in)
713
0
        *in = lookup(*in);
714
0
      return true;
715
0
    }
716
  };
717
718
  struct EncodeNormLookup : public Encode 
719
  {
720
    typedef FromUniNormEntry E;
721
    NormTable<E> * data;
722
2.61k
    EncodeNormLookup(NormTable<E> * d) : data(d) {}
723
    // *stop must equal 0
724
    void encode(const FilterChar * in, const FilterChar * stop, 
725
12.2k
                CharVector & out) const {
726
12.7M
      while (in < stop) {
727
12.7M
        if (*in == 0) {
728
0
          out.append('\0');
729
0
          ++in;
730
12.7M
        } else {
731
12.7M
          NormLookupRet<E,const FilterChar> ret = norm_lookup<E>(data, in, stop, (const byte *)"?", in);
732
25.4M
          for (unsigned i = 0; i < E::max_to && ret.to[i]; ++i)
733
12.7M
            out.append(ret.to[i]);
734
12.7M
          in = ret.last + 1;
735
12.7M
        }
736
12.7M
      }
737
12.2k
    }
738
    PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop, 
739
19.6k
                             CharVector & out, ParmStr orig) const {
740
58.9k
      while (in < stop) {
741
39.3k
        if (*in == 0) {
742
0
          out.append('\0');
743
0
          ++in;
744
39.3k
        } else {
745
39.3k
          NormLookupRet<E,const FilterChar> ret = norm_lookup<E>(data, in, stop, 0, in);
746
39.3k
          if (ret.to == 0) {
747
0
            char m[70];
748
0
            snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr);
749
0
            return make_err(invalid_string, orig, m);
750
0
          }
751
78.6k
          for (unsigned i = 0; i < E::max_to && ret.to[i]; ++i)
752
39.3k
            out.append(ret.to[i]);
753
39.3k
          in = ret.last + 1;
754
39.3k
        }
755
39.3k
      }
756
19.6k
      return no_err;
757
19.6k
    }
758
    bool encode(FilterChar * & in, FilterChar * & stop,
759
845
                FilterCharVector & buf) const {
760
845
      buf.clear();
761
23.0M
      while (in < stop) {
762
23.0M
        if (*in == 0) {
763
2.65M
          buf.append(FilterChar(0));
764
2.65M
          ++in;
765
20.3M
        } else {
766
20.3M
          NormLookupRet<E,FilterChar> ret = norm_lookup<E>(data, in, stop, (const byte *)"?", in);
767
20.3M
          const FilterChar * end = ret.last + 1;
768
20.3M
          unsigned width = 0;
769
40.7M
          for (; in != end; ++in) width += in->width;
770
20.3M
          buf.append(FilterChar(ret.to[0], width));
771
20.3M
          for (unsigned i = 1; i < E::max_to && ret.to[i]; ++i) {
772
1
            buf.append(FilterChar(ret.to[i],0));
773
1
          }
774
20.3M
        }
775
23.0M
      }
776
845
      buf.append(0);
777
845
      in = buf.pbegin();
778
845
      stop = buf.pend();
779
845
      return true;
780
845
    }
781
  };
782
783
  //////////////////////////////////////////////////////////////////////
784
  //
785
  //  UTF8
786
  //
787
  
788
#define get_check_next \
789
1.25k
  if (in == stop) goto error;          \
790
1.25k
  c = *in;                             \
791
1.25k
  if ((c & 0xC0/*1100 0000*/) != 0x80/*10xx xxxx*/) goto error;\
792
1.25k
  ++in;                                \
793
1.24k
  u <<= 6;                             \
794
1.24k
  u |= c & 0x3F/*0011 1111*/;          \
795
1.24k
  ++w;
796
797
  static inline FilterChar from_utf8 (const char * & in, const char * stop = 0,
798
                                      Uni32 err_char = '?')
799
73.6k
  {
800
73.6k
    Uni32 u = (Uni32)(-1);
801
73.6k
    FilterChar::Width w = 1;
802
803
    // the first char is guaranteed not to be off the end
804
73.6k
    char c = *in;
805
73.6k
    ++in;
806
807
73.6k
    if ((c & 0x80/*1000 0000*/) == 0x00/*0xxx xxx*/) {
808
73.0k
      u = c;
809
73.0k
    } else if ((c & 0xE0/*1110 0000*/) == 0xC0/*110x xxxx*/) { // 2-byte wide
810
68
      u  = c & 0x1F/*0001 1111*/;
811
136
      get_check_next;
812
526
    } else if ((c & 0xF0/*1111 0000*/) == 0xE0/*1110 xxxx*/) { // 3-byte wide
813
378
      u  = c & 0x0F/*0000 1111*/;
814
754
      get_check_next;
815
754
      get_check_next;
816
751
    } else if ((c & 0xF8/*1111 1000*/) == 0xF0/*1111 0xxx*/) { // 4-byte wide
817
145
      u  = c & 0x07/*0000 0111*/;
818
289
      get_check_next;
819
289
      get_check_next;
820
287
      get_check_next;
821
286
    } else {
822
3
      goto error;
823
3
    }
824
825
73.6k
    return FilterChar(u, w);
826
8
  error:
827
8
    return FilterChar(err_char, w);
828
73.6k
  }
829
830
  // Appends the UTF-8 encoding of in to out, using one to four bytes.
  // Code points of 0x200000 and above produce no output at all.
  static inline void to_utf8 (FilterChar in, CharVector & out)
  {
    FilterChar::Chr c = in;

    if (c < 0x80) {                       // 1 byte
      out.append(c);
      return;
    }
    if (c < 0x800) {                      // 2 bytes
      out.append(0xC0 | (c>>6));
      out.append(0x80 | (c & 0x3F));
      return;
    }
    if (c < 0x10000) {                    // 3 bytes
      out.append(0xE0 | (c>>12));
      out.append(0x80 | (c>>6 & 0x3F));
      out.append(0x80 | (c & 0x3F));
      return;
    }
    if (c < 0x200000) {                   // 4 bytes
      out.append(0xF0 | (c>>18));
      out.append(0x80 | (c>>12 & 0x3F));
      out.append(0x80 | (c>>6 & 0x3F));
      out.append(0x80 | (c & 0x3F));
    }
  }
853
  
854
  struct DecodeUtf8 : public Decode 
855
  {
856
    ToUniLookup lookup;
857
0
    void decode(const char * in, int size, FilterCharVector & out) const {
858
0
      if (size == -1) {
859
0
        while (*in)
860
0
          out.append(from_utf8(in));
861
0
      } else {
862
0
        const char * stop = in + size;
863
0
        while (in != stop)
864
0
          out.append(from_utf8(in, stop));
865
0
      }
866
0
    }
867
    PosibErr<void> decode_ec(const char * in, int size, 
868
24.2k
                             FilterCharVector & out, ParmStr orig) const {
869
24.2k
      const char * begin = in;
870
24.2k
      if (size == -1) {
871
34.3k
        while (*in) {
872
31.3k
          FilterChar c = from_utf8(in, 0, (Uni32)-1);
873
31.3k
          if (c == (Uni32)-1) goto error;
874
31.3k
          out.append(c);
875
31.3k
        }
876
21.1k
      } else {
877
21.1k
        const char * stop = in + size;
878
63.4k
        while (in != stop) {
879
42.3k
          FilterChar c = from_utf8(in, stop, (Uni32)-1);
880
42.3k
          if (c == (Uni32)-1) goto error;
881
42.3k
          out.append(c);
882
42.3k
        }
883
21.1k
      }
884
24.2k
      return no_err;
885
8
    error:
886
8
      char m[70];
887
8
      snprintf(m, 70, _("Invalid UTF-8 sequence at position %ld."), (long)(in - begin));
888
8
      return make_err(invalid_string, orig, m);
889
24.2k
    }
890
  };
891
892
  struct EncodeUtf8 : public Encode 
893
  {
894
    FromUniLookup lookup;
895
    void encode(const FilterChar * in, const FilterChar * stop, 
896
0
                CharVector & out) const {
897
0
      for (; in != stop; ++in) {
898
0
        to_utf8(*in, out);
899
0
      }
900
0
    }
901
    PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop, 
902
0
                             CharVector & out, ParmStr) const {
903
0
      for (; in != stop; ++in) {
904
0
        to_utf8(*in, out);
905
0
      }
906
0
      return no_err;
907
0
    }
908
  };
909
910
  //////////////////////////////////////////////////////////////////////
911
  //
912
  // Cache
913
  //
914
915
  // File-local caches for Decode, Encode and NormTables objects.  The
  // Convert::init* functions in this file pass their addresses to
  // setup() when obtaining a decoder, an encoder or norm tables.
  static GlobalCache<Decode> decode_cache("decode");
916
  static GlobalCache<Encode> encode_cache("encode");
917
  static GlobalCache<NormTables> norm_tables_cache("norm_tables");
918
  
919
  //////////////////////////////////////////////////////////////////////
920
  //
921
  // new_aspell_convert
922
  //
923
924
  void Convert::generic_convert(const char * in, int size, CharVector & out)
925
1.32k
  {
926
1.32k
    buf_.clear();
927
1.32k
    decode_->decode(in, size, buf_);
928
1.32k
    FilterChar * start = buf_.pbegin();
929
1.32k
    FilterChar * stop = buf_.pend();
930
1.32k
    if (!filter.empty())
931
1.32k
      filter.process(start, stop);
932
1.32k
    encode_->encode(start, stop, out);
933
1.32k
  }
934
935
  // Canonicalizes an encoding name.  The name is lower-cased into buf,
  // "iso8859..." becomes "iso-8859...", and a few aliases are mapped to
  // the names used elsewhere in this file.  The result is either a
  // string literal or buf.c_str(), so buf must outlive the result.
  const char * fix_encoding_str(ParmStr enc, String & buf)
  {
    buf.clear();
    const size_t len = enc.size();
    buf.reserve(len + 1);
    size_t pos = 0;
    while (pos != len) {
      buf.push_back(asc_tolower(enc[pos]));
      ++pos;
    }

    const bool old_iso_spelling = (strncmp(buf.c_str(), "iso8859", 7) == 0);
    if (old_iso_spelling)
      buf.insert(buf.begin() + 3, '-'); // For backwards compatibility
    
    if (buf == "ascii" || buf == "ansi_x3.4-1968")
      return "iso-8859-1";
    if (buf == "machine unsigned 16" || buf == "utf-16")
      return "ucs-2";
    if (buf == "machine unsigned 32" || buf == "utf-32")
      return "ucs-4";
    return buf.c_str();
  }
954
955
  bool ascii_encoding(const Config & c, ParmStr enc0)
956
2.11k
  {
957
2.11k
    if (enc0.empty()) return true;
958
2.11k
    if (enc0 == "ANSI_X3.4-1968" 
959
2.11k
        || enc0 == "ASCII" || enc0 == "ascii") return true;
960
0
    String buf;
961
0
    const char * enc = fix_encoding_str(enc0, buf);
962
0
    if (strcmp(enc, "utf-8") == 0 
963
0
        || strcmp(enc, "ucs-2") == 0 
964
0
        || strcmp(enc, "ucs-4") == 0) return false;
965
0
    String dir1,dir2,file_name;
966
0
    fill_data_dir(&c, dir1, dir2);
967
0
    file_name << dir1 << enc << ".cset";
968
0
    if (file_exists(file_name)) return false;
969
0
    if (dir1 == dir2) return true;
970
0
    file_name.clear();
971
0
    file_name << dir2 << enc << ".cset";
972
0
    return !file_exists(file_name);
973
0
  }
974
975
  // Creates a Convert object for the in -> out encoding pair, after
  // canonicalizing both names.  Returns a null pointer when if_needed
  // is set and both names compare equal.  norm selects which of the
  // Convert init functions is used.
  PosibErr<Convert *> internal_new_convert(const Config & c,
                                           ConvKey in, 
                                           ConvKey out,
                                           bool if_needed,
                                           Normalize norm)
  {
    String in_s;
    String out_s;
    in.val = fix_encoding_str(in.val, in_s);
    out.val = fix_encoding_str(out.val, out_s); 

    if (if_needed && in.val == out.val) return 0;

    StackPtr<Convert> conv(new Convert);
    if (norm == NormNone) {
      RET_ON_ERR(conv->init(c, in, out));
    } else if (norm == NormFrom) {
      RET_ON_ERR(conv->init_norm_from(c, in, out));
    } else if (norm == NormTo) {
      RET_ON_ERR(conv->init_norm_to(c, in, out));
    }
    return conv.release();
  }
1000
1001
  // Creates and initializes the decoder for the encoding named by
  // k.val.  "ucs-2" and "ucs-4" are rejected with
  // encoding_not_supported unless k.allow_ucs is set.  Any name not
  // handled explicitly gets a table-driven DecodeLookup.
  PosibErr<Decode *> Decode::get_new(const ConvKey & k, const Config * c)
  {
    StackPtr<Decode> ptr;
    if (k.val == "iso-8859-1") {
      ptr.reset(new DecodeDirect<Uni8>);
    } else if (k.val == "ucs-2" || k.val == "ucs-4") {
      if (!k.allow_ucs)
        return make_err(encoding_not_supported, k.val);
      if (k.val == "ucs-2")
        ptr.reset(new DecodeDirect<Uni16>);
      else
        ptr.reset(new DecodeDirect<Uni32>);
    } else if (k.val == "utf-8") {
      ptr.reset(new DecodeUtf8);
    } else {
      ptr.reset(new DecodeLookup);
    }

    RET_ON_ERR(ptr->init(k.val, *c));
    ptr->key = k.val;
    return ptr.release();
  }
1025
1026
  // Creates and initializes the encoder for the encoding named by
  // k.val.  "ucs-2" and "ucs-4" are rejected with
  // encoding_not_supported unless k.allow_ucs is set, matching
  // Decode::get_new.  Any name not handled explicitly gets a
  // table-driven EncodeLookup.
  PosibErr<Encode *> Encode::get_new(const ConvKey & k, const Config * c)
  {
    StackPtr<Encode> ptr;
    if (k.val == "iso-8859-1") {
      ptr.reset(new EncodeDirect<Uni8>);
    } else if (k.val == "ucs-2") {
      // The branch condition must not also test allow_ucs: doing so
      // made the error return below unreachable and sent a disallowed
      // request on to the EncodeLookup fallback.
      if (k.allow_ucs)
        ptr.reset(new EncodeDirect<Uni16>);
      else
        return make_err(encoding_not_supported, k.val);
    } else if (k.val == "ucs-4") {
      if (k.allow_ucs)
        ptr.reset(new EncodeDirect<Uni32>);
      else
        return make_err(encoding_not_supported, k.val);
    } else if (k.val == "utf-8") {
      ptr.reset(new EncodeUtf8);
    } else {
      ptr.reset(new EncodeLookup);
    }
    RET_ON_ERR(ptr->init(k.val, *c));
    ptr->key = k.val;
    return ptr.release();
  }
1050
1051
11.2k
  // Out-of-line destructor; the body is intentionally empty.
  Convert::~Convert() {}
1052
1053
  // Sets up a plain (non-normalizing) conversion.  The decoder and
  // encoder are obtained through the caches.  When both encodings
  // have the same name a direct converter is installed as well, with
  // "ucs-2"/"ucs-4" rejected unless in.allow_ucs is set.
  PosibErr<void> Convert::init(const Config & c, const ConvKey & in, const ConvKey & out)
  {
    RET_ON_ERR(setup(decode_c, &decode_cache, &c, in));
    decode_ = decode_c.get();
    RET_ON_ERR(setup(encode_c, &encode_cache, &c, out));
    encode_ = encode_c.get();

    conv_ = 0;
    if (!(in.val == out.val))
      return no_err;

    // Same encoding on both sides: pick the unit width to copy with.
    const bool is_ucs2 = (in.val == "ucs-2");
    const bool is_ucs4 = !is_ucs2 && (in.val == "ucs-4");
    if ((is_ucs2 || is_ucs4) && !in.allow_ucs)
      return make_err(encoding_not_supported, in.val);

    if (is_ucs2) {
      conv_ = new ConvDirect<Uni16>;
    } else if (is_ucs4) {
      conv_ = new ConvDirect<Uni32>;
    } else {
      conv_ = new ConvDirect<char>;
    }

    {
      RET_ON_ERR(conv_->init(decode_, encode_, c));
    }

    return no_err;
  }
1084
1085
  
1086
  // Sets up a conversion whose encoding step goes through the norm
  // tables of the output encoding.  Falls back to the plain init()
  // when neither "normalize" nor "norm-required" is enabled.  The
  // "norm-strict" option selects the strict table over the internal
  // one.
  PosibErr<void> Convert::init_norm_from(const Config & c, const ConvKey & in, const ConvKey & out)
  {
    const bool normalizing =
      c.retrieve_bool("normalize") || c.retrieve_bool("norm-required");
    if (!normalizing)
      return init(c,in,out);

    RET_ON_ERR(setup(norm_tables_, &norm_tables_cache, &c, out.val));

    RET_ON_ERR(setup(decode_c, &decode_cache, &c, in));
    decode_ = decode_c.get();

    // The encoder is owned by this object (encode_s) rather than
    // taken from the cache; its key records which table it uses.
    const char * key_suffix;
    if (c.retrieve_bool("norm-strict")) {
      encode_s = new EncodeNormLookup(norm_tables_->strict);
      key_suffix = ":strict";
    } else {
      encode_s = new EncodeNormLookup(norm_tables_->internal);
      key_suffix = ":internal";
    }
    encode_ = encode_s;
    encode_->key = out.val;
    encode_->key += key_suffix;

    conv_ = 0;

    return no_err;
  }
1111
1112
  // Sets up a conversion whose decoding step goes through the norm
  // tables of the input encoding, using the table named by the
  // "norm-form" option.  Falls back to the plain init() when
  // normalization is off (or the form is "none") and "norm-required"
  // is not set.  A form of "none" combined with "norm-required" is
  // treated as "nfc".
  PosibErr<void> Convert::init_norm_to(const Config & c, const ConvKey & in, const ConvKey & out)
  {
    String norm_form = c.retrieve("norm-form");
    const bool form_is_none = (norm_form == "none");

    if (!c.retrieve_bool("normalize") || form_is_none) {
      if (!c.retrieve_bool("norm-required"))
        return init(c,in,out);
    }
    if (form_is_none && c.retrieve_bool("norm-required"))
      norm_form = "nfc";

    RET_ON_ERR(setup(norm_tables_, &norm_tables_cache, &c, in.val));

    RET_ON_ERR(setup(encode_c, &encode_cache, &c, out));
    encode_ = encode_c.get();

    // Find the to-Unicode table whose name matches the requested form.
    NormTables::ToUni::const_iterator i = norm_tables_->to_uni.begin();
    while (i != norm_tables_->to_uni.end() && i->name != norm_form)
      ++i;
    if (i == norm_tables_->to_uni.end())
      return make_err(aerror_bad_value, "norm-form", norm_form, "one of none, nfd, nfc, or comp");

    // The decoder is owned by this object (decode_s) rather than
    // taken from the cache; its key is "<encoding>:<form>".
    decode_s = new DecodeNormLookup(i->ptr);
    decode_ = decode_s;
    decode_->key = in.val;
    decode_->key += ':';
    decode_->key += i->name;

    conv_ = 0;

    return no_err;
  }
1141
1142
  // Records which length-counting rule applies to the encoding named
  // by enc0: UTF8, UCS2 or UCS4, with Other for everything else.
  // Never reports an error.
  PosibErr<void> MBLen::setup(const Config &, ParmStr enc0)
  {
    String buf;
    const char * enc = fix_encoding_str(enc0,buf);

    encoding = Other;
    if (strcmp(enc, "utf-8") == 0) {
      encoding = UTF8;
    } else if (strcmp(enc, "ucs-2") == 0) {
      encoding = UCS2;
    } else if (strcmp(enc, "ucs-4") == 0) {
      encoding = UCS4;
    }
    return no_err;
  }
1152
1153
  unsigned MBLen::operator()(const char * str, const char * stop)
1154
0
  {
1155
0
    unsigned size = 0;
1156
0
    switch (encoding) {
1157
0
    case Other: 
1158
0
      return stop - str;
1159
0
    case UTF8:
1160
0
      for (; str != stop; ++str) {
1161
0
        if ((*str & 0x80) == 0 || (*str & 0xC0) == 0xC0) ++size;
1162
0
      }
1163
0
      return size;
1164
0
    case UCS2:
1165
0
      return (stop - str)/2;
1166
0
    case UCS4:
1167
0
      return (stop - str)/4;
1168
0
    }
1169
0
    return 0;
1170
0
  }
1171
1172
0
  // Builds the "unsupported null terminated wide string" error for
  // the caller named by func.  The message is also printed to CERR,
  // but only the first time this function is called.
  PosibErr<void> unsupported_null_term_wide_string_err_(const char * func) {
    static bool already_printed = false;
    PosibErr<void> result = make_err(other_error, unsupported_null_term_wide_string_msg);
    if (already_printed)
      return result;
    CERR.printf("ERROR: %s: %s\n", func, unsupported_null_term_wide_string_msg);
    already_printed = true;
    return result;
  }
1181
1182
0
  // Prints the unsupported-null-terminated-wide-string message to CERR,
  // prefixed with the caller's name (func), then aborts the process.
  void unsupported_null_term_wide_string_abort_(const char * func) {
1183
0
    CERR.printf("%s: %s\n", func, unsupported_null_term_wide_string_msg);
1184
0
    abort();
1185
0
  }
1186
 
1187
}