/src/aspell/common/convert.cpp
Line | Count | Source |
1 | | // This file is part of The New Aspell |
2 | | // Copyright (C) 2001 by Kevin Atkinson under the GNU LGPL license |
3 | | // version 2.0 or 2.1. You should have received a copy of the LGPL |
4 | | // license along with this library if you did not you can find |
5 | | // it at http://www.gnu.org/. |
6 | | |
7 | | #include <assert.h> |
8 | | #include <string.h> |
9 | | #include <math.h> |
10 | | |
11 | | #include "asc_ctype.hpp" |
12 | | #include "convert.hpp" |
13 | | #include "fstream.hpp" |
14 | | #include "getdata.hpp" |
15 | | #include "config.hpp" |
16 | | #include "errors.hpp" |
17 | | #include "stack_ptr.hpp" |
18 | | #include "cache-t.hpp" |
19 | | #include "file_util.hpp" |
20 | | #include "file_data_util.hpp" |
21 | | #include "vararray.hpp" |
22 | | |
23 | | #include "iostream.hpp" |
24 | | |
25 | | #include "gettext.h" |
26 | | |
27 | | namespace acommon { |
28 | | |
29 | | typedef unsigned char byte; |
30 | | typedef unsigned char Uni8; |
31 | | typedef unsigned short Uni16; |
32 | | typedef unsigned int Uni32; |
33 | | |
34 | | |
35 | | ////////////////////////////////////////////////////////////////////// |
36 | | ////////////////////////////////////////////////////////////////////// |
37 | | // |
38 | | // Lookups |
39 | | // |
40 | | ////////////////////////////////////////////////////////////////////// |
41 | | ////////////////////////////////////////////////////////////////////// |
42 | | |
43 | | ////////////////////////////////////////////////////////////////////// |
44 | | // |
45 | | // ToUniLookup |
46 | | // |
47 | | |
48 | | class ToUniLookup |
49 | | { |
50 | | Uni32 data[256]; |
51 | | static const Uni32 npos = (Uni32)(-1); |
52 | | public: |
53 | | void reset(); |
54 | 245k | Uni32 operator[] (char key) const {return data[(unsigned char)key];} |
55 | 0 | bool have(char key) const {return data[(unsigned char)key] != npos;} |
56 | | bool insert(char key, Uni32 value); |
57 | | }; |
58 | | |
59 | | void ToUniLookup::reset() |
60 | 32 | { |
61 | 8.22k | for (int i = 0; i != 256; ++i) |
62 | 8.19k | data[i] = npos; |
63 | 32 | } |
64 | | |
65 | | bool ToUniLookup::insert(char key, Uni32 value) |
66 | 5.63k | { |
67 | 5.63k | if (data[(unsigned char)key] != npos) |
68 | 0 | return false; |
69 | 5.63k | data[(unsigned char)key] = value; |
70 | 5.63k | return true; |
71 | 5.63k | } |
72 | | |
73 | | ////////////////////////////////////////////////////////////////////// |
74 | | // |
75 | | // FromUniLookup |
76 | | // |
77 | | |
78 | | // Assumes that the maximum number of items in the table is 256 |
79 | | // Also assumes (unsigned char)i == i % 256 |
80 | | |
81 | | // Based on the iso-8859-* character sets it is very fast, almost all |
82 | | // lookups involving no more than 2 comparisons. |
83 | | // NO looks ups involded more than 3 compassions. |
84 | | // Also, no division (or modules) is done whatsoever. |
85 | | |
86 | | |
87 | | struct UniItem { |
88 | | Uni32 key; |
89 | | char value; |
90 | | }; |
91 | | |
92 | | class FromUniLookup |
93 | | { |
94 | | private: |
95 | | static const Uni32 npos = (Uni32)(-1); |
96 | | UniItem * overflow_end; |
97 | | |
98 | | UniItem data[256*4]; |
99 | | |
100 | | UniItem overflow[256]; // you can never be too careful; |
101 | | |
102 | | public: |
103 | 979 | FromUniLookup() {} |
104 | | void reset(); |
105 | | inline char operator() (Uni32 key, char unknown = '?') const; |
106 | | bool insert(Uni32 key, char value); |
107 | | }; |
108 | | |
109 | | void FromUniLookup::reset() |
110 | 32 | { |
111 | 32.8k | for (unsigned i = 0; i != 256*4; ++i) |
112 | 32.7k | data[i].key = npos; |
113 | 32 | overflow_end = overflow; |
114 | 32 | } |
115 | | |
116 | | inline char FromUniLookup::operator() (Uni32 k, char unknown) const |
117 | 31.2k | { |
118 | 31.2k | const UniItem * i = data + (unsigned char)k * 4; |
119 | | |
120 | 31.2k | if (i->key == k) return i->value; |
121 | 0 | ++i; |
122 | 0 | if (i->key == k) return i->value; |
123 | 0 | ++i; |
124 | 0 | if (i->key == k) return i->value; |
125 | 0 | ++i; |
126 | 0 | if (i->key == k) return i->value; |
127 | | |
128 | 0 | if (i->key == npos) return unknown; |
129 | | |
130 | 0 | for(i = overflow; i != overflow_end; ++i) |
131 | 0 | if (i->key == k) return i->value; |
132 | | |
133 | 0 | return unknown; |
134 | 0 | } |
135 | | |
136 | | bool FromUniLookup::insert(Uni32 k, char v) |
137 | 5.63k | { |
138 | 5.63k | UniItem * i = data + (unsigned char)k * 4; |
139 | 5.63k | UniItem * e = i + 4; |
140 | 7.21k | while (i != e && i->key != npos) { |
141 | 1.58k | if (i->key == k) |
142 | 0 | return false; |
143 | 1.58k | ++i; |
144 | 1.58k | } |
145 | 5.63k | if (i == e) { |
146 | 0 | for(i = overflow; i != overflow_end; ++i) |
147 | 0 | if (i->key == k) return false; |
148 | 0 | } |
149 | 5.63k | i->key = k; |
150 | 5.63k | i->value = v; |
151 | 5.63k | return true; |
152 | 5.63k | } |
153 | | |
154 | | ////////////////////////////////////////////////////////////////////// |
155 | | // |
156 | | // CharLookup |
157 | | // |
158 | | |
159 | | class CharLookup |
160 | | { |
161 | | private: |
162 | | int data[256]; |
163 | | public: |
164 | | void reset(); |
165 | 0 | char operator[] (char key) const {return data[(unsigned char)key];} |
166 | | bool insert(char key, char value); |
167 | | }; |
168 | | |
169 | 0 | void CharLookup::reset() { |
170 | 0 | for (int i = 0; i != 256; ++i) |
171 | 0 | data[i] = -1; |
172 | 0 | } |
173 | | |
174 | | bool CharLookup::insert(char key, char value) |
175 | 0 | { |
176 | 0 | if (data[(unsigned char)key] != -1) |
177 | 0 | return false; |
178 | 0 | data[(unsigned char)key] = value; |
179 | 0 | return true; |
180 | 0 | } |
181 | | |
182 | | ////////////////////////////////////////////////////////////////////// |
183 | | // |
184 | | // NormLookup |
185 | | // |
186 | | |
187 | | template <class T> |
188 | | struct NormTable |
189 | | { |
190 | | static const unsigned struct_size; |
191 | | unsigned mask; |
192 | | unsigned height; |
193 | | unsigned width; |
194 | | unsigned size; |
195 | | T * end; |
196 | | T data[1]; // hack for data[] |
197 | | }; |
198 | | |
199 | | template <class T> |
200 | | const unsigned NormTable<T>::struct_size = sizeof(NormTable<T>) - 1; |
201 | | |
202 | | template <class T, class From> |
203 | | struct NormLookupRet |
204 | | { |
205 | | const typename T::To * to; |
206 | | const From * last; |
207 | | NormLookupRet(const typename T::To * t, From * l) |
208 | 35.2M | : to(t), last(l) {}acommon::NormLookupRet<acommon::FromUniNormEntry, acommon::FilterChar const>::NormLookupRet(unsigned char const*, acommon::FilterChar const*) Line | Count | Source | 208 | 12.7M | : to(t), last(l) {} |
acommon::NormLookupRet<acommon::FromUniNormEntry, acommon::FilterChar>::NormLookupRet(unsigned char const*, acommon::FilterChar*) Line | Count | Source | 208 | 20.3M | : to(t), last(l) {} |
acommon::NormLookupRet<acommon::ToUniNormEntry, char const>::NormLookupRet(unsigned short const*, char const*) Line | Count | Source | 208 | 2.09M | : to(t), last(l) {} |
|
209 | | }; |
210 | | |
211 | | template <class T, class From> |
212 | | static inline NormLookupRet<T,From> norm_lookup(const NormTable<T> * d, |
213 | | From * s, From * stop, |
214 | | const typename T::To * def, |
215 | | From * prev) |
216 | 35.2M | { |
217 | 46.8M | loop: |
218 | 46.8M | if (s != stop) { |
219 | 46.8M | const T * i = d->data + (static_cast<typename T::From>(*s) & d->mask); |
220 | 53.9M | for (;;) { |
221 | 53.9M | if (i->from == static_cast<typename T::From>(*s)) { |
222 | 35.2M | if (i->sub_table) { |
223 | | // really tail recursion |
224 | 11.6M | if (i->to[1] != T::to_non_char) {def = i->to; prev = s;} |
225 | 11.6M | d = (const NormTable<T> *)(i->sub_table); |
226 | 11.6M | s++; |
227 | 11.6M | goto loop; |
228 | 23.5M | } else { |
229 | 23.5M | return NormLookupRet<T,From>(i->to, s); |
230 | 23.5M | } |
231 | 35.2M | } else { |
232 | 18.6M | i += d->height; |
233 | 18.6M | if (i >= d->end) break; |
234 | 18.6M | } |
235 | 53.9M | } |
236 | 46.8M | } |
237 | 11.6M | return NormLookupRet<T,From>(def, prev); |
238 | 46.8M | } convert.cpp:acommon::NormLookupRet<acommon::FromUniNormEntry, acommon::FilterChar const> acommon::norm_lookup<acommon::FromUniNormEntry, acommon::FilterChar const>(acommon::NormTable<acommon::FromUniNormEntry> const*, acommon::FilterChar const*, acommon::FilterChar const*, acommon::FromUniNormEntry::To const*, acommon::FilterChar const*) Line | Count | Source | 216 | 12.7M | { | 217 | 18.2M | loop: | 218 | 18.2M | if (s != stop) { | 219 | 18.2M | const T * i = d->data + (static_cast<typename T::From>(*s) & d->mask); | 220 | 21.5M | for (;;) { | 221 | 21.5M | if (i->from == static_cast<typename T::From>(*s)) { | 222 | 12.7M | if (i->sub_table) { | 223 | | // really tail recursion | 224 | 5.51M | if (i->to[1] != T::to_non_char) {def = i->to; prev = s;} | 225 | 5.51M | d = (const NormTable<T> *)(i->sub_table); | 226 | 5.51M | s++; | 227 | 5.51M | goto loop; | 228 | 7.25M | } else { | 229 | 7.25M | return NormLookupRet<T,From>(i->to, s); | 230 | 7.25M | } | 231 | 12.7M | } else { | 232 | 8.83M | i += d->height; | 233 | 8.83M | if (i >= d->end) break; | 234 | 8.83M | } | 235 | 21.5M | } | 236 | 18.2M | } | 237 | 5.51M | return NormLookupRet<T,From>(def, prev); | 238 | 18.2M | } |
convert.cpp:acommon::NormLookupRet<acommon::FromUniNormEntry, acommon::FilterChar> acommon::norm_lookup<acommon::FromUniNormEntry, acommon::FilterChar>(acommon::NormTable<acommon::FromUniNormEntry> const*, acommon::FilterChar*, acommon::FilterChar*, acommon::FromUniNormEntry::To const*, acommon::FilterChar*) Line | Count | Source | 216 | 20.3M | { | 217 | 26.4M | loop: | 218 | 26.4M | if (s != stop) { | 219 | 26.4M | const T * i = d->data + (static_cast<typename T::From>(*s) & d->mask); | 220 | 30.2M | for (;;) { | 221 | 30.2M | if (i->from == static_cast<typename T::From>(*s)) { | 222 | 20.3M | if (i->sub_table) { | 223 | | // really tail recursion | 224 | 6.09M | if (i->to[1] != T::to_non_char) {def = i->to; prev = s;} | 225 | 6.09M | d = (const NormTable<T> *)(i->sub_table); | 226 | 6.09M | s++; | 227 | 6.09M | goto loop; | 228 | 14.2M | } else { | 229 | 14.2M | return NormLookupRet<T,From>(i->to, s); | 230 | 14.2M | } | 231 | 20.3M | } else { | 232 | 9.86M | i += d->height; | 233 | 9.86M | if (i >= d->end) break; | 234 | 9.86M | } | 235 | 30.2M | } | 236 | 26.4M | } | 237 | 6.13M | return NormLookupRet<T,From>(def, prev); | 238 | 26.4M | } |
convert.cpp:acommon::NormLookupRet<acommon::ToUniNormEntry, char const> acommon::norm_lookup<acommon::ToUniNormEntry, char const>(acommon::NormTable<acommon::ToUniNormEntry> const*, char const*, char const*, acommon::ToUniNormEntry::To const*, char const*) Line | Count | Source | 216 | 2.09M | { | 217 | 2.09M | loop: | 218 | 2.09M | if (s != stop) { | 219 | 2.09M | const T * i = d->data + (static_cast<typename T::From>(*s) & d->mask); | 220 | 2.09M | for (;;) { | 221 | 2.09M | if (i->from == static_cast<typename T::From>(*s)) { | 222 | 2.09M | if (i->sub_table) { | 223 | | // really tail recursion | 224 | 0 | if (i->to[1] != T::to_non_char) {def = i->to; prev = s;} | 225 | 0 | d = (const NormTable<T> *)(i->sub_table); | 226 | 0 | s++; | 227 | 0 | goto loop; | 228 | 2.09M | } else { | 229 | 2.09M | return NormLookupRet<T,From>(i->to, s); | 230 | 2.09M | } | 231 | 2.09M | } else { | 232 | 0 | i += d->height; | 233 | 0 | if (i >= d->end) break; | 234 | 0 | } | 235 | 2.09M | } | 236 | 2.09M | } | 237 | 0 | return NormLookupRet<T,From>(def, prev); | 238 | 2.09M | } |
|
239 | | |
240 | | template <class T> |
241 | | void free_norm_table(NormTable<T> * d) |
242 | 31.6k | { |
243 | 4.73M | for (T * cur = d->data; cur != d->end; ++cur) { |
244 | 4.70M | if (cur->sub_table) |
245 | 28.0k | free_norm_table<T>(static_cast<NormTable<T> *>(cur->sub_table)); |
246 | 4.70M | } |
247 | 31.6k | free(d); |
248 | 31.6k | } void acommon::free_norm_table<acommon::FromUniNormEntry>(acommon::NormTable<acommon::FromUniNormEntry>*) Line | Count | Source | 242 | 29.8k | { | 243 | 4.28M | for (T * cur = d->data; cur != d->end; ++cur) { | 244 | 4.25M | if (cur->sub_table) | 245 | 28.0k | free_norm_table<T>(static_cast<NormTable<T> *>(cur->sub_table)); | 246 | 4.25M | } | 247 | 29.8k | free(d); | 248 | 29.8k | } |
void acommon::free_norm_table<acommon::ToUniNormEntry>(acommon::NormTable<acommon::ToUniNormEntry>*) Line | Count | Source | 242 | 1.75k | { | 243 | 451k | for (T * cur = d->data; cur != d->end; ++cur) { | 244 | 449k | if (cur->sub_table) | 245 | 0 | free_norm_table<T>(static_cast<NormTable<T> *>(cur->sub_table)); | 246 | 449k | } | 247 | 1.75k | free(d); | 248 | 1.75k | } |
|
249 | | |
250 | | struct FromUniNormEntry |
251 | | { |
252 | | typedef Uni32 From; |
253 | | Uni32 from; |
254 | | typedef byte To; |
255 | | byte to[4]; |
256 | | static const From from_non_char = (From)(-1); |
257 | | static const To to_non_char = 0x10; |
258 | | static const unsigned max_to = 4; |
259 | | void * sub_table; |
260 | | } |
261 | | #ifdef __GNUC__ |
262 | | __attribute__ ((aligned (16))) |
263 | | #endif |
264 | | ; |
265 | | |
266 | | struct ToUniNormEntry |
267 | | { |
268 | | typedef byte From; |
269 | | byte from; |
270 | | typedef Uni16 To; |
271 | | Uni16 to[3]; |
272 | | static const From from_non_char = 0x10; |
273 | | static const To to_non_char = 0x10; |
274 | | static const unsigned max_to = 3; |
275 | | void * sub_table; |
276 | | } |
277 | | #ifdef __GNUC__ |
278 | | __attribute__ ((aligned (16))) |
279 | | #endif |
280 | | ; |
281 | | |
282 | | ////////////////////////////////////////////////////////////////////// |
283 | | // |
284 | | // read in char data |
285 | | // |
286 | | |
287 | | PosibErr<void> read_in_char_data (const Config & config, |
288 | | ParmStr encoding, |
289 | | ToUniLookup & to, |
290 | | FromUniLookup & from) |
291 | 32 | { |
292 | 32 | to.reset(); |
293 | 32 | from.reset(); |
294 | | |
295 | 32 | String dir1,dir2,file_name; |
296 | 32 | fill_data_dir(&config, dir1, dir2); |
297 | 32 | find_file(file_name,dir1,dir2,encoding,".cset"); |
298 | | |
299 | 32 | FStream data; |
300 | 32 | PosibErrBase err = data.open(file_name, "r"); |
301 | 32 | if (err.get_err()) { |
302 | 10 | char mesg[300]; |
303 | 10 | snprintf(mesg, 300, _("This could also mean that the file \"%s\" could not be opened for reading or does not exist."), |
304 | 10 | file_name.c_str()); |
305 | 10 | return make_err(unknown_encoding, encoding, mesg); |
306 | 10 | } |
307 | 22 | unsigned chr; |
308 | 22 | Uni32 uni; |
309 | 22 | String line; |
310 | 22 | char * p; |
311 | 66 | do { |
312 | 66 | p = get_nb_line(data, line); |
313 | 66 | } while (*p != '/'); |
314 | 5.65k | for (chr = 0; chr != 256; ++chr) { |
315 | 5.63k | p = get_nb_line(data, line); |
316 | 5.63k | if (strtoul(p, 0, 16) != chr) |
317 | 0 | return make_err(bad_file_format, file_name); |
318 | 5.63k | uni = strtoul(p + 3, 0, 16); |
319 | 5.63k | to.insert(chr, uni); |
320 | 5.63k | from.insert(uni, chr); |
321 | 5.63k | } |
322 | | |
323 | 22 | return no_err; |
324 | 22 | } |
325 | | |
326 | | ////////////////////////////////////////////////////////////////////// |
327 | | // |
328 | | // read in norm data |
329 | | // |
330 | | |
331 | | struct Tally |
332 | | { |
333 | | int size; |
334 | | Uni32 mask; |
335 | | int max; |
336 | | int * data; |
337 | 94.8k | Tally(int s, int * d) : size(s), mask(s - 1), max(0), data(d) { |
338 | 94.8k | memset(data, 0, sizeof(int)*size); |
339 | 94.8k | } |
340 | 4.83M | void add(Uni32 chr) { |
341 | 4.83M | Uni32 p = chr & mask; |
342 | 4.83M | data[p]++; |
343 | 4.83M | if (data[p] > max) max = data[p]; |
344 | 4.83M | } |
345 | | }; |
346 | | |
347 | | # define sanity(check) \ |
348 | 8.28M | if (!(check)) return sanity_fail(__FILE__, FUNC, __LINE__, #check) |
349 | | |
350 | | static PosibErrBase sanity_fail(const char * file, const char * func, |
351 | | unsigned line, const char * check_str) |
352 | 0 | { |
353 | 0 | char mesg[500]; |
354 | 0 | snprintf(mesg, 500, "%s:%d: %s: Assertion \"%s\" failed.", |
355 | 0 | file, line, func, check_str); |
356 | 0 | return make_err(bad_input_error, mesg); |
357 | 0 | } |
358 | | # define CREATE_NORM_TABLE(T, in, buf, res) \ |
359 | 31.6k | do { PosibErr<NormTable<T> *> pe( create_norm_table<T>(in,buf) );\ |
360 | 31.6k | if (pe.has_err()) return PosibErrBase(pe); \ |
361 | 31.6k | res = pe.data; } while(false) |
362 | | |
363 | | template <class T> |
364 | | static PosibErr< NormTable<T> * > create_norm_table(IStream & in, String & buf) |
365 | 31.6k | { |
366 | 31.6k | const char FUNC[] = "create_norm_table"; |
367 | 31.6k | const char * p = get_nb_line(in, buf); |
368 | 31.6k | sanity(*p == 'N'); |
369 | 31.6k | ++p; |
370 | 31.6k | int size = strtoul(p, (char **)&p, 10); |
371 | 31.6k | VARARRAY(T, d, size); |
372 | 31.6k | memset(d, 0, sizeof(T) * size); |
373 | 31.6k | int sz = 1 << (unsigned)floor(log(size <= 1 ? 1.0 : size - 1)/log(2.0)); |
374 | 31.6k | VARARRAY(int, tally0_d, sz); Tally tally0(sz, tally0_d); |
375 | 31.6k | VARARRAY(int, tally1_d, sz*2); Tally tally1(sz*2, tally1_d); |
376 | 31.6k | VARARRAY(int, tally2_d, sz*4); Tally tally2(sz*4, tally2_d); |
377 | 31.6k | T * cur = d; |
378 | 1.64M | while (p = get_nb_line(in, buf), *p != '.') { |
379 | 1.61M | Uni32 f = strtoul(p, (char **)&p, 16); |
380 | 1.61M | cur->from = static_cast<typename T::From>(f); |
381 | 1.61M | sanity(f == cur->from); |
382 | 1.61M | tally0.add(f); |
383 | 1.61M | tally1.add(f); |
384 | 1.61M | tally2.add(f); |
385 | 1.61M | ++p; |
386 | 1.61M | sanity(*p == '>'); |
387 | 1.61M | ++p; |
388 | 1.61M | sanity(*p == ' '); |
389 | 1.61M | ++p; |
390 | 1.61M | unsigned i = 0; |
391 | 1.61M | if (*p != '-') { |
392 | 3.30M | for (;; ++i) { |
393 | 3.30M | const char * q = p; |
394 | 3.30M | Uni32 t = strtoul(p, (char **)&p, 16); |
395 | 3.30M | if (q == p) break; |
396 | 1.68M | sanity(i < d->max_to); |
397 | 1.68M | cur->to[i] = static_cast<typename T::To>(t); |
398 | 1.68M | sanity(t == static_cast<Uni32>(cur->to[i])); |
399 | 1.68M | } |
400 | 1.61M | } else { |
401 | 0 | cur->to[0] = 0; |
402 | 0 | cur->to[1] = T::to_non_char; |
403 | 0 | } |
404 | 1.61M | if (*p == ' ') ++p; |
405 | 1.61M | if (*p == '/') CREATE_NORM_TABLE(T, in, buf, cur->sub_table); |
406 | 1.61M | ++cur; |
407 | 1.61M | } |
408 | 31.6k | sanity(cur - d == size); |
409 | 31.6k | Tally * which = &tally0; |
410 | 31.6k | if (which->max > tally1.max) which = &tally1; |
411 | 31.6k | if (which->max > tally2.max) which = &tally2; |
412 | 31.6k | NormTable<T> * final = (NormTable<T> *)calloc(1, NormTable<T>::struct_size + |
413 | 31.6k | sizeof(T) * which->size * which->max); |
414 | 31.6k | memset(final, 0, NormTable<T>::struct_size + sizeof(T) * which->size * which->max); |
415 | 31.6k | final->mask = which->size - 1; |
416 | 31.6k | final->height = which->size; |
417 | 31.6k | final->width = which->max; |
418 | 31.6k | final->end = final->data + which->size * which->max; |
419 | 31.6k | final->size = size; |
420 | 1.64M | for (cur = d; cur != d + size; ++cur) { |
421 | 1.61M | T * dest = final->data + (cur->from & final->mask); |
422 | 1.67M | while (dest->from != 0) dest += final->height; |
423 | 1.61M | *dest = *cur; |
424 | 1.61M | if (dest->from == 0) dest->from = T::from_non_char; |
425 | 1.61M | } |
426 | 74.6k | for (T * dest = final->data; dest < final->end; dest += final->height) { |
427 | 43.0k | if (dest->from == 0 || (dest->from == T::from_non_char && dest->to[0] == 0)) { |
428 | 3.51k | dest->from = T::from_non_char; |
429 | 3.51k | dest->to[0] = T::to_non_char; |
430 | 3.51k | } |
431 | 43.0k | } |
432 | 31.6k | return final; |
433 | 31.6k | } convert.cpp:acommon::PosibErr<acommon::NormTable<acommon::FromUniNormEntry>*> acommon::create_norm_table<acommon::FromUniNormEntry>(acommon::IStream&, acommon::String&) Line | Count | Source | 365 | 29.8k | { | 366 | 29.8k | const char FUNC[] = "create_norm_table"; | 367 | 29.8k | const char * p = get_nb_line(in, buf); | 368 | 29.8k | sanity(*p == 'N'); | 369 | 29.8k | ++p; | 370 | 29.8k | int size = strtoul(p, (char **)&p, 10); | 371 | 29.8k | VARARRAY(T, d, size); | 372 | 29.8k | memset(d, 0, sizeof(T) * size); | 373 | 29.8k | int sz = 1 << (unsigned)floor(log(size <= 1 ? 1.0 : size - 1)/log(2.0)); | 374 | 29.8k | VARARRAY(int, tally0_d, sz); Tally tally0(sz, tally0_d); | 375 | 29.8k | VARARRAY(int, tally1_d, sz*2); Tally tally1(sz*2, tally1_d); | 376 | 29.8k | VARARRAY(int, tally2_d, sz*4); Tally tally2(sz*4, tally2_d); | 377 | 29.8k | T * cur = d; | 378 | 1.19M | while (p = get_nb_line(in, buf), *p != '.') { | 379 | 1.16M | Uni32 f = strtoul(p, (char **)&p, 16); | 380 | 1.16M | cur->from = static_cast<typename T::From>(f); | 381 | 1.16M | sanity(f == cur->from); | 382 | 1.16M | tally0.add(f); | 383 | 1.16M | tally1.add(f); | 384 | 1.16M | tally2.add(f); | 385 | 1.16M | ++p; | 386 | 1.16M | sanity(*p == '>'); | 387 | 1.16M | ++p; | 388 | 1.16M | sanity(*p == ' '); | 389 | 1.16M | ++p; | 390 | 1.16M | unsigned i = 0; | 391 | 1.16M | if (*p != '-') { | 392 | 2.35M | for (;; ++i) { | 393 | 2.35M | const char * q = p; | 394 | 2.35M | Uni32 t = strtoul(p, (char **)&p, 16); | 395 | 2.35M | if (q == p) break; | 396 | 1.19M | sanity(i < d->max_to); | 397 | 1.19M | cur->to[i] = static_cast<typename T::To>(t); | 398 | 1.19M | sanity(t == static_cast<Uni32>(cur->to[i])); | 399 | 1.19M | } | 400 | 1.16M | } else { | 401 | 0 | cur->to[0] = 0; | 402 | 0 | cur->to[1] = T::to_non_char; | 403 | 0 | } | 404 | 1.16M | if (*p == ' ') ++p; | 405 | 1.16M | if (*p == '/') CREATE_NORM_TABLE(T, in, buf, cur->sub_table); | 406 | 1.16M | ++cur; | 407 | 1.16M | } | 408 | 
29.8k | sanity(cur - d == size); | 409 | 29.8k | Tally * which = &tally0; | 410 | 29.8k | if (which->max > tally1.max) which = &tally1; | 411 | 29.8k | if (which->max > tally2.max) which = &tally2; | 412 | 29.8k | NormTable<T> * final = (NormTable<T> *)calloc(1, NormTable<T>::struct_size + | 413 | 29.8k | sizeof(T) * which->size * which->max); | 414 | 29.8k | memset(final, 0, NormTable<T>::struct_size + sizeof(T) * which->size * which->max); | 415 | 29.8k | final->mask = which->size - 1; | 416 | 29.8k | final->height = which->size; | 417 | 29.8k | final->width = which->max; | 418 | 29.8k | final->end = final->data + which->size * which->max; | 419 | 29.8k | final->size = size; | 420 | 1.19M | for (cur = d; cur != d + size; ++cur) { | 421 | 1.16M | T * dest = final->data + (cur->from & final->mask); | 422 | 1.22M | while (dest->from != 0) dest += final->height; | 423 | 1.16M | *dest = *cur; | 424 | 1.16M | if (dest->from == 0) dest->from = T::from_non_char; | 425 | 1.16M | } | 426 | 71.1k | for (T * dest = final->data; dest < final->end; dest += final->height) { | 427 | 41.2k | if (dest->from == 0 || (dest->from == T::from_non_char && dest->to[0] == 0)) { | 428 | 1.75k | dest->from = T::from_non_char; | 429 | 1.75k | dest->to[0] = T::to_non_char; | 430 | 1.75k | } | 431 | 41.2k | } | 432 | 29.8k | return final; | 433 | 29.8k | } |
convert.cpp:acommon::PosibErr<acommon::NormTable<acommon::ToUniNormEntry>*> acommon::create_norm_table<acommon::ToUniNormEntry>(acommon::IStream&, acommon::String&) Line | Count | Source | 365 | 1.75k | { | 366 | 1.75k | const char FUNC[] = "create_norm_table"; | 367 | 1.75k | const char * p = get_nb_line(in, buf); | 368 | 1.75k | sanity(*p == 'N'); | 369 | 1.75k | ++p; | 370 | 1.75k | int size = strtoul(p, (char **)&p, 10); | 371 | 1.75k | VARARRAY(T, d, size); | 372 | 1.75k | memset(d, 0, sizeof(T) * size); | 373 | 1.75k | int sz = 1 << (unsigned)floor(log(size <= 1 ? 1.0 : size - 1)/log(2.0)); | 374 | 1.75k | VARARRAY(int, tally0_d, sz); Tally tally0(sz, tally0_d); | 375 | 1.75k | VARARRAY(int, tally1_d, sz*2); Tally tally1(sz*2, tally1_d); | 376 | 1.75k | VARARRAY(int, tally2_d, sz*4); Tally tally2(sz*4, tally2_d); | 377 | 1.75k | T * cur = d; | 378 | 451k | while (p = get_nb_line(in, buf), *p != '.') { | 379 | 449k | Uni32 f = strtoul(p, (char **)&p, 16); | 380 | 449k | cur->from = static_cast<typename T::From>(f); | 381 | 449k | sanity(f == cur->from); | 382 | 449k | tally0.add(f); | 383 | 449k | tally1.add(f); | 384 | 449k | tally2.add(f); | 385 | 449k | ++p; | 386 | 449k | sanity(*p == '>'); | 387 | 449k | ++p; | 388 | 449k | sanity(*p == ' '); | 389 | 449k | ++p; | 390 | 449k | unsigned i = 0; | 391 | 449k | if (*p != '-') { | 392 | 945k | for (;; ++i) { | 393 | 945k | const char * q = p; | 394 | 945k | Uni32 t = strtoul(p, (char **)&p, 16); | 395 | 945k | if (q == p) break; | 396 | 496k | sanity(i < d->max_to); | 397 | 496k | cur->to[i] = static_cast<typename T::To>(t); | 398 | 496k | sanity(t == static_cast<Uni32>(cur->to[i])); | 399 | 496k | } | 400 | 449k | } else { | 401 | 0 | cur->to[0] = 0; | 402 | 0 | cur->to[1] = T::to_non_char; | 403 | 0 | } | 404 | 449k | if (*p == ' ') ++p; | 405 | 449k | if (*p == '/') CREATE_NORM_TABLE(T, in, buf, cur->sub_table); | 406 | 449k | ++cur; | 407 | 449k | } | 408 | 1.75k | sanity(cur - d == size); | 409 | 1.75k | 
Tally * which = &tally0; | 410 | 1.75k | if (which->max > tally1.max) which = &tally1; | 411 | 1.75k | if (which->max > tally2.max) which = &tally2; | 412 | 1.75k | NormTable<T> * final = (NormTable<T> *)calloc(1, NormTable<T>::struct_size + | 413 | 1.75k | sizeof(T) * which->size * which->max); | 414 | 1.75k | memset(final, 0, NormTable<T>::struct_size + sizeof(T) * which->size * which->max); | 415 | 1.75k | final->mask = which->size - 1; | 416 | 1.75k | final->height = which->size; | 417 | 1.75k | final->width = which->max; | 418 | 1.75k | final->end = final->data + which->size * which->max; | 419 | 1.75k | final->size = size; | 420 | 451k | for (cur = d; cur != d + size; ++cur) { | 421 | 449k | T * dest = final->data + (cur->from & final->mask); | 422 | 449k | while (dest->from != 0) dest += final->height; | 423 | 449k | *dest = *cur; | 424 | 449k | if (dest->from == 0) dest->from = T::from_non_char; | 425 | 449k | } | 426 | 3.51k | for (T * dest = final->data; dest < final->end; dest += final->height) { | 427 | 1.75k | if (dest->from == 0 || (dest->from == T::from_non_char && dest->to[0] == 0)) { | 428 | 1.75k | dest->from = T::from_non_char; | 429 | 1.75k | dest->to[0] = T::to_non_char; | 430 | 1.75k | } | 431 | 1.75k | } | 432 | 1.75k | return final; | 433 | 1.75k | } |
|
434 | | |
435 | | static PosibErr<void> init_norm_tables(FStream & in, NormTables * d) |
436 | 878 | { |
437 | 878 | const char FUNC[] = "init_norm_tables"; |
438 | 878 | String l; |
439 | 878 | get_nb_line(in, l); |
440 | 878 | remove_comments(l); |
441 | 878 | sanity (l == "INTERNAL"); |
442 | 878 | get_nb_line(in, l); |
443 | 878 | remove_comments(l); |
444 | 878 | sanity (l == "/"); |
445 | 878 | CREATE_NORM_TABLE(FromUniNormEntry, in, l, d->internal); |
446 | 878 | get_nb_line(in, l); |
447 | 878 | remove_comments(l); |
448 | 878 | sanity (l == "STRICT"); |
449 | 878 | char * p = get_nb_line(in, l); |
450 | 878 | remove_comments(l); |
451 | 878 | if (l == "/") { |
452 | 878 | CREATE_NORM_TABLE(FromUniNormEntry, in, l, d->strict_d); |
453 | 878 | d->strict = d->strict_d; |
454 | 878 | } else { |
455 | 0 | sanity(*p == '='); |
456 | 0 | ++p; ++p; |
457 | 0 | sanity(strcmp(p, "INTERNAL") == 0); |
458 | 0 | d->strict = d->internal; |
459 | 0 | } |
460 | 3.51k | while (get_nb_line(in, l)) { |
461 | 2.63k | remove_comments(l); |
462 | 2.63k | d->to_uni.push_back(NormTables::ToUniTable()); |
463 | 2.63k | NormTables::ToUniTable & e = d->to_uni.back(); |
464 | 2.63k | e.name.resize(l.size()); |
465 | 11.4k | for (unsigned i = 0; i != l.size(); ++i) |
466 | 8.78k | e.name[i] = asc_tolower(l[i]); |
467 | 2.63k | char * p = get_nb_line(in, l); |
468 | 2.63k | remove_comments(l); |
469 | 2.63k | if (l == "/") { |
470 | 1.75k | CREATE_NORM_TABLE(ToUniNormEntry, in, l, e.data); |
471 | 1.75k | e.ptr = e.data; |
472 | 1.75k | } else { |
473 | 878 | sanity(*p == '='); |
474 | 878 | ++p; ++p; |
475 | 3.51k | for (char * q = p; *q; ++q) *q = asc_tolower(*q); |
476 | 878 | Vector<NormTables::ToUniTable>::iterator i = d->to_uni.begin(); |
477 | 1.75k | while (i->name != p && i != d->to_uni.end()) ++i; |
478 | 878 | sanity(i != d->to_uni.end()); |
479 | 878 | e.ptr = i->ptr; |
480 | 878 | get_nb_line(in, l); |
481 | 878 | } |
482 | 2.63k | } |
483 | 878 | return no_err; |
484 | 878 | } |
485 | | |
486 | | PosibErr<NormTables *> NormTables::get_new(const String & encoding, |
487 | | const Config * config) |
488 | 878 | { |
489 | 878 | String dir1,dir2,file_name; |
490 | 878 | fill_data_dir(config, dir1, dir2); |
491 | 878 | find_file(file_name,dir1,dir2,encoding,".cmap"); |
492 | | |
493 | 878 | FStream in; |
494 | 878 | PosibErrBase err = in.open(file_name, "r"); |
495 | 878 | if (err.get_err()) { |
496 | 0 | char mesg[300]; |
497 | 0 | snprintf(mesg, 300, _("This could also mean that the file \"%s\" could not be opened for reading or does not exist."), |
498 | 0 | file_name.c_str()); |
499 | 0 | return make_err(unknown_encoding, encoding, mesg); // FIXME |
500 | 0 | } |
501 | | |
502 | 878 | NormTables * d = new NormTables; |
503 | 878 | d->key = encoding; |
504 | 878 | err = init_norm_tables(in, d); |
505 | 878 | if (err.has_err()) { |
506 | 0 | return make_err(bad_file_format, file_name, err.get_err()->mesg); |
507 | 0 | } |
508 | | |
509 | 878 | return d; |
510 | | |
511 | 878 | } |
512 | | |
513 | | NormTables::~NormTables() |
514 | 878 | { |
515 | 878 | free_norm_table<FromUniNormEntry>(internal); |
516 | 878 | if (strict_d) |
517 | 878 | free_norm_table<FromUniNormEntry>(strict_d); |
518 | 3.51k | for (unsigned i = 0; i != to_uni.size(); ++i) { |
519 | 2.63k | if (to_uni[i].data) |
520 | 1.75k | free_norm_table<ToUniNormEntry>(to_uni[i].data); |
521 | 2.63k | } |
522 | 878 | } |
523 | | |
524 | | ////////////////////////////////////////////////////////////////////// |
525 | | ////////////////////////////////////////////////////////////////////// |
526 | | // |
527 | | // Convert |
528 | | // |
529 | | ////////////////////////////////////////////////////////////////////// |
530 | | ////////////////////////////////////////////////////////////////////// |
531 | | |
532 | | |
533 | | bool operator== (const Convert & rhs, const Convert & lhs) |
534 | 0 | { |
535 | 0 | return strcmp(rhs.in_code(), lhs.in_code()) == 0 |
536 | 0 | && strcmp(rhs.out_code(), lhs.out_code()) == 0; |
537 | 0 | } |
538 | | |
539 | | ////////////////////////////////////////////////////////////////////// |
540 | | // |
541 | | // Trivial Conversion |
542 | | // |
543 | | |
544 | | const char * unsupported_null_term_wide_string_msg = |
545 | | "Null-terminated wide-character strings unsupported when used this way."; |
546 | | |
547 | | template <typename Chr> |
548 | | struct DecodeDirect : public Decode |
549 | | { |
550 | 967 | DecodeDirect() {type_width = sizeof(Chr);}acommon::DecodeDirect<unsigned char>::DecodeDirect() Line | Count | Source | 550 | 828 | DecodeDirect() {type_width = sizeof(Chr);} |
acommon::DecodeDirect<unsigned short>::DecodeDirect() Line | Count | Source | 550 | 26 | DecodeDirect() {type_width = sizeof(Chr);} |
acommon::DecodeDirect<unsigned int>::DecodeDirect() Line | Count | Source | 550 | 113 | DecodeDirect() {type_width = sizeof(Chr);} |
|
551 | 221k | void decode(const char * in0, int size, FilterCharVector & out) const { |
552 | 221k | const Chr * in = reinterpret_cast<const Chr *>(in0); |
553 | 221k | if (size == -sizeof(Chr)) { |
554 | 761k | for (;*in; ++in) |
555 | 554k | out.append(*in, sizeof(Chr)); |
556 | 207k | } else if (size <= -1) { |
557 | 0 | fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg); |
558 | 0 | abort(); |
559 | 14.7k | } else { |
560 | 14.7k | const Chr * stop = reinterpret_cast<const Chr *>(in0) + size/sizeof(Chr); |
561 | 36.2M | for (;in != stop; ++in) |
562 | 36.2M | out.append(*in, sizeof(Chr)); |
563 | 14.7k | } |
564 | 221k | } acommon::DecodeDirect<unsigned char>::decode(char const*, int, acommon::FilterCharVector&) const Line | Count | Source | 551 | 219k | void decode(const char * in0, int size, FilterCharVector & out) const { | 552 | 219k | const Chr * in = reinterpret_cast<const Chr *>(in0); | 553 | 219k | if (size == -sizeof(Chr)) { | 554 | 761k | for (;*in; ++in) | 555 | 554k | out.append(*in, sizeof(Chr)); | 556 | 207k | } else if (size <= -1) { | 557 | 0 | fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg); | 558 | 0 | abort(); | 559 | 12.2k | } else { | 560 | 12.2k | const Chr * stop = reinterpret_cast<const Chr *>(in0) + size/sizeof(Chr); | 561 | 35.7M | for (;in != stop; ++in) | 562 | 35.7M | out.append(*in, sizeof(Chr)); | 563 | 12.2k | } | 564 | 219k | } |
acommon::DecodeDirect<unsigned short>::decode(char const*, int, acommon::FilterCharVector&) const Line | Count | Source | 551 | 1.42k | void decode(const char * in0, int size, FilterCharVector & out) const { | 552 | 1.42k | const Chr * in = reinterpret_cast<const Chr *>(in0); | 553 | 1.42k | if (size == -sizeof(Chr)) { | 554 | 0 | for (;*in; ++in) | 555 | 0 | out.append(*in, sizeof(Chr)); | 556 | 1.42k | } else if (size <= -1) { | 557 | 0 | fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg); | 558 | 0 | abort(); | 559 | 1.42k | } else { | 560 | 1.42k | const Chr * stop = reinterpret_cast<const Chr *>(in0) + size/sizeof(Chr); | 561 | 421k | for (;in != stop; ++in) | 562 | 419k | out.append(*in, sizeof(Chr)); | 563 | 1.42k | } | 564 | 1.42k | } |
acommon::DecodeDirect<unsigned int>::decode(char const*, int, acommon::FilterCharVector&) const Line | Count | Source | 551 | 1.06k | void decode(const char * in0, int size, FilterCharVector & out) const { | 552 | 1.06k | const Chr * in = reinterpret_cast<const Chr *>(in0); | 553 | 1.06k | if (size == -sizeof(Chr)) { | 554 | 0 | for (;*in; ++in) | 555 | 0 | out.append(*in, sizeof(Chr)); | 556 | 1.06k | } else if (size <= -1) { | 557 | 0 | fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg); | 558 | 0 | abort(); | 559 | 1.06k | } else { | 560 | 1.06k | const Chr * stop = reinterpret_cast<const Chr *>(in0) + size/sizeof(Chr); | 561 | 82.9k | for (;in != stop; ++in) | 562 | 81.9k | out.append(*in, sizeof(Chr)); | 563 | 1.06k | } | 564 | 1.06k | } |
|
565 | | PosibErr<void> decode_ec(const char * in0, int size, |
566 | 0 | FilterCharVector & out, ParmStr) const { |
567 | 0 | DecodeDirect::decode(in0, size, out); |
568 | 0 | return no_err; |
569 | 0 | } Unexecuted instantiation: acommon::DecodeDirect<unsigned char>::decode_ec(char const*, int, acommon::FilterCharVector&, acommon::ParmString const&) const Unexecuted instantiation: acommon::DecodeDirect<unsigned short>::decode_ec(char const*, int, acommon::FilterCharVector&, acommon::ParmString const&) const Unexecuted instantiation: acommon::DecodeDirect<unsigned int>::decode_ec(char const*, int, acommon::FilterCharVector&, acommon::ParmString const&) const |
570 | | }; |
571 | | |
572 | | template <typename Chr> |
573 | | struct EncodeDirect : public Encode |
574 | | { |
575 | 1.01k | EncodeDirect() {type_width = sizeof(Chr);}acommon::EncodeDirect<unsigned char>::EncodeDirect() Line | Count | Source | 575 | 828 | EncodeDirect() {type_width = sizeof(Chr);} |
acommon::EncodeDirect<unsigned short>::EncodeDirect() Line | Count | Source | 575 | 26 | EncodeDirect() {type_width = sizeof(Chr);} |
acommon::EncodeDirect<unsigned int>::EncodeDirect() Line | Count | Source | 575 | 162 | EncodeDirect() {type_width = sizeof(Chr);} |
|
576 | | void encode(const FilterChar * in, const FilterChar * stop, |
577 | 770k | CharVector & out) const { |
578 | 3.56M | for (; in != stop; ++in) { |
579 | 2.79M | Chr c = in->chr; |
580 | 2.79M | if (c != in->chr) c = '?'; |
581 | 2.79M | out.append(&c, sizeof(Chr)); |
582 | 2.79M | } |
583 | 770k | } acommon::EncodeDirect<unsigned char>::encode(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&) const Line | Count | Source | 577 | 563k | CharVector & out) const { | 578 | 2.80M | for (; in != stop; ++in) { | 579 | 2.24M | Chr c = in->chr; | 580 | 2.24M | if (c != in->chr) c = '?'; | 581 | 2.24M | out.append(&c, sizeof(Chr)); | 582 | 2.24M | } | 583 | 563k | } |
acommon::EncodeDirect<unsigned short>::encode(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&) const Line | Count | Source | 577 | 145k | CharVector & out) const { | 578 | 583k | for (; in != stop; ++in) { | 579 | 438k | Chr c = in->chr; | 580 | 438k | if (c != in->chr) c = '?'; | 581 | 438k | out.append(&c, sizeof(Chr)); | 582 | 438k | } | 583 | 145k | } |
acommon::EncodeDirect<unsigned int>::encode(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&) const Line | Count | Source | 577 | 62.0k | CharVector & out) const { | 578 | 178k | for (; in != stop; ++in) { | 579 | 115k | Chr c = in->chr; | 580 | 115k | if (c != in->chr) c = '?'; | 581 | 115k | out.append(&c, sizeof(Chr)); | 582 | 115k | } | 583 | 62.0k | } |
|
584 | | PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop, |
585 | 4.58k | CharVector & out, ParmStr orig) const { |
586 | 38.8k | for (; in != stop; ++in) { |
587 | 34.2k | Chr c = in->chr; |
588 | 34.2k | if (c != in->chr) { |
589 | 0 | char m[70]; |
590 | 0 | snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr); |
591 | 0 | return make_err(invalid_string, orig, m); |
592 | 0 | } |
593 | | |
594 | 34.2k | out.append(&c, sizeof(Chr)); |
595 | 34.2k | } |
596 | 4.58k | return no_err; |
597 | 4.58k | } acommon::EncodeDirect<unsigned char>::encode_ec(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&, acommon::ParmString const&) const Line | Count | Source | 585 | 1.50k | CharVector & out, ParmStr orig) const { | 586 | 4.51k | for (; in != stop; ++in) { | 587 | 3.00k | Chr c = in->chr; | 588 | 3.00k | if (c != in->chr) { | 589 | 0 | char m[70]; | 590 | 0 | snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr); | 591 | 0 | return make_err(invalid_string, orig, m); | 592 | 0 | } | 593 | | | 594 | 3.00k | out.append(&c, sizeof(Chr)); | 595 | 3.00k | } | 596 | 1.50k | return no_err; | 597 | 1.50k | } |
Unexecuted instantiation: acommon::EncodeDirect<unsigned short>::encode_ec(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&, acommon::ParmString const&) const acommon::EncodeDirect<unsigned int>::encode_ec(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&, acommon::ParmString const&) const Line | Count | Source | 585 | 3.07k | CharVector & out, ParmStr orig) const { | 586 | 34.3k | for (; in != stop; ++in) { | 587 | 31.2k | Chr c = in->chr; | 588 | 31.2k | if (c != in->chr) { | 589 | 0 | char m[70]; | 590 | 0 | snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr); | 591 | 0 | return make_err(invalid_string, orig, m); | 592 | 0 | } | 593 | | | 594 | 31.2k | out.append(&c, sizeof(Chr)); | 595 | 31.2k | } | 596 | 3.07k | return no_err; | 597 | 3.07k | } |
|
598 | 65 | bool encode(FilterChar * &, FilterChar * &, FilterCharVector &) const { |
599 | 65 | return true; |
600 | 65 | } acommon::EncodeDirect<unsigned char>::encode(acommon::FilterChar*&, acommon::FilterChar*&, acommon::FilterCharVector&) const Line | Count | Source | 598 | 65 | bool encode(FilterChar * &, FilterChar * &, FilterCharVector &) const { | 599 | 65 | return true; | 600 | 65 | } |
Unexecuted instantiation: acommon::EncodeDirect<unsigned short>::encode(acommon::FilterChar*&, acommon::FilterChar*&, acommon::FilterCharVector&) const Unexecuted instantiation: acommon::EncodeDirect<unsigned int>::encode(acommon::FilterChar*&, acommon::FilterChar*&, acommon::FilterCharVector&) const |
601 | | }; |
602 | | |
603 | | template <typename Chr> |
604 | | struct ConvDirect : public DirectConv |
605 | | { |
606 | 56 | ConvDirect() {type_width = sizeof(Chr);}Unexecuted instantiation: acommon::ConvDirect<unsigned short>::ConvDirect() Unexecuted instantiation: acommon::ConvDirect<unsigned int>::ConvDirect() acommon::ConvDirect<char>::ConvDirect() Line | Count | Source | 606 | 56 | ConvDirect() {type_width = sizeof(Chr);} |
|
607 | 41.7k | void convert(const char * in0, int size, CharVector & out) const { |
608 | 41.7k | if (size == -sizeof(Chr)) { |
609 | 40.8k | const Chr * in = reinterpret_cast<const Chr *>(in0); |
610 | 179k | for (;*in != 0; ++in) |
611 | 138k | out.append(in, sizeof(Chr)); |
612 | 40.8k | } else if (size <= -1) { |
613 | 0 | fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg); |
614 | 0 | abort(); |
615 | 914 | } else { |
616 | 914 | out.append(in0, size); |
617 | 914 | } |
618 | 41.7k | } Unexecuted instantiation: acommon::ConvDirect<unsigned short>::convert(char const*, int, acommon::String&) const Unexecuted instantiation: acommon::ConvDirect<unsigned int>::convert(char const*, int, acommon::String&) const acommon::ConvDirect<char>::convert(char const*, int, acommon::String&) const Line | Count | Source | 607 | 41.7k | void convert(const char * in0, int size, CharVector & out) const { | 608 | 41.7k | if (size == -sizeof(Chr)) { | 609 | 40.8k | const Chr * in = reinterpret_cast<const Chr *>(in0); | 610 | 179k | for (;*in != 0; ++in) | 611 | 138k | out.append(in, sizeof(Chr)); | 612 | 40.8k | } else if (size <= -1) { | 613 | 0 | fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg); | 614 | 0 | abort(); | 615 | 914 | } else { | 616 | 914 | out.append(in0, size); | 617 | 914 | } | 618 | 41.7k | } |
|
619 | | PosibErr<void> convert_ec(const char * in0, int size, |
620 | 0 | CharVector & out, ParmStr) const { |
621 | 0 | ConvDirect::convert(in0, size, out); |
622 | 0 | return no_err; |
623 | 0 | } Unexecuted instantiation: acommon::ConvDirect<unsigned short>::convert_ec(char const*, int, acommon::String&, acommon::ParmString const&) const Unexecuted instantiation: acommon::ConvDirect<unsigned int>::convert_ec(char const*, int, acommon::String&, acommon::ParmString const&) const Unexecuted instantiation: acommon::ConvDirect<char>::convert_ec(char const*, int, acommon::String&, acommon::ParmString const&) const |
624 | | }; |
625 | | |
626 | | ////////////////////////////////////////////////////////////////////// |
627 | | // |
628 | | // Lookup Conversion |
629 | | // |
630 | | |
631 | | struct DecodeLookup : public Decode |
632 | | { |
633 | | ToUniLookup lookup; |
634 | 21 | PosibErr<void> init(ParmStr code, const Config & c) { |
635 | 21 | FromUniLookup unused; |
636 | 21 | return read_in_char_data(c, code, lookup, unused); |
637 | 21 | } |
638 | 791 | void decode(const char * in, int size, FilterCharVector & out) const { |
639 | 791 | if (size == -1) { |
640 | 0 | for (;*in; ++in) |
641 | 0 | out.append(lookup[*in]); |
642 | 791 | } else { |
643 | 791 | const char * stop = in + size; |
644 | 246k | for (;in != stop; ++in) |
645 | 245k | out.append(lookup[*in]); |
646 | 791 | } |
647 | 791 | } |
648 | | PosibErr<void> decode_ec(const char * in, int size, |
649 | 0 | FilterCharVector & out, ParmStr) const { |
650 | 0 | DecodeLookup::decode(in, size, out); |
651 | 0 | return no_err; |
652 | 0 | } |
653 | | }; |
654 | | |
655 | | struct DecodeNormLookup : public Decode |
656 | | { |
657 | | typedef ToUniNormEntry E; |
658 | | NormTable<E> * data; |
659 | 7.76k | DecodeNormLookup(NormTable<E> * d) : data(d) {} |
660 | | // must be null terminated |
661 | | // FIXME: Why must it be null terminated? |
662 | 568k | void decode(const char * in, int size, FilterCharVector & out) const { |
663 | 568k | const char * stop = in + size; // will work even if size -1 |
664 | 2.66M | while (in != stop) { |
665 | 2.66M | if (*in == 0) { |
666 | 568k | if (size == -1) break; |
667 | 0 | out.append(0); |
668 | 0 | ++in; |
669 | 2.09M | } else { |
670 | 2.09M | NormLookupRet<E,const char> ret = norm_lookup<E>(data, in, stop, 0, in); |
671 | 4.19M | for (unsigned i = 0; ret.to[i] && i < E::max_to; ++i) |
672 | 2.09M | out.append(ret.to[i]); |
673 | 2.09M | in = ret.last + 1; |
674 | 2.09M | } |
675 | 2.66M | } |
676 | 568k | } |
677 | | PosibErr<void> decode_ec(const char * in, int size, |
678 | 0 | FilterCharVector & out, ParmStr) const { |
679 | 0 | DecodeNormLookup::decode(in, size, out); |
680 | 0 | return no_err; |
681 | 0 | } |
682 | | }; |
683 | | |
684 | | struct EncodeLookup : public Encode |
685 | | { |
686 | | FromUniLookup lookup; |
687 | | PosibErr<void> init(ParmStr code, const Config & c) |
688 | 11 | {ToUniLookup unused; |
689 | 11 | return read_in_char_data(c, code, unused, lookup);} |
690 | | void encode(const FilterChar * in, const FilterChar * stop, |
691 | 7.34k | CharVector & out) const { |
692 | 38.6k | for (; in != stop; ++in) { |
693 | 31.2k | out.append(lookup(*in)); |
694 | 31.2k | } |
695 | 7.34k | } |
696 | | PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop, |
697 | 0 | CharVector & out, ParmStr orig) const { |
698 | 0 | for (; in != stop; ++in) { |
699 | 0 | char c = lookup(*in, '\0'); |
700 | 0 | if (c == '\0' && in->chr != 0) { |
701 | 0 | char m[70]; |
702 | 0 | snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr); |
703 | 0 | return make_err(invalid_string, orig, m); |
704 | 0 | } |
705 | 0 | out.append(c); |
706 | 0 | } |
707 | 0 | return no_err; |
708 | 0 | } |
709 | | bool encode(FilterChar * & in0, FilterChar * & stop, |
710 | 0 | FilterCharVector & out) const { |
711 | 0 | FilterChar * in = in0; |
712 | 0 | for (; in != stop; ++in) |
713 | 0 | *in = lookup(*in); |
714 | 0 | return true; |
715 | 0 | } |
716 | | }; |
717 | | |
718 | | struct EncodeNormLookup : public Encode |
719 | | { |
720 | | typedef FromUniNormEntry E; |
721 | | NormTable<E> * data; |
722 | 2.61k | EncodeNormLookup(NormTable<E> * d) : data(d) {} |
723 | | // *stop must equal 0 |
724 | | void encode(const FilterChar * in, const FilterChar * stop, |
725 | 12.2k | CharVector & out) const { |
726 | 12.7M | while (in < stop) { |
727 | 12.7M | if (*in == 0) { |
728 | 0 | out.append('\0'); |
729 | 0 | ++in; |
730 | 12.7M | } else { |
731 | 12.7M | NormLookupRet<E,const FilterChar> ret = norm_lookup<E>(data, in, stop, (const byte *)"?", in); |
732 | 25.4M | for (unsigned i = 0; i < E::max_to && ret.to[i]; ++i) |
733 | 12.7M | out.append(ret.to[i]); |
734 | 12.7M | in = ret.last + 1; |
735 | 12.7M | } |
736 | 12.7M | } |
737 | 12.2k | } |
738 | | PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop, |
739 | 19.6k | CharVector & out, ParmStr orig) const { |
740 | 58.9k | while (in < stop) { |
741 | 39.3k | if (*in == 0) { |
742 | 0 | out.append('\0'); |
743 | 0 | ++in; |
744 | 39.3k | } else { |
745 | 39.3k | NormLookupRet<E,const FilterChar> ret = norm_lookup<E>(data, in, stop, 0, in); |
746 | 39.3k | if (ret.to == 0) { |
747 | 0 | char m[70]; |
748 | 0 | snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr); |
749 | 0 | return make_err(invalid_string, orig, m); |
750 | 0 | } |
751 | 78.6k | for (unsigned i = 0; i < E::max_to && ret.to[i]; ++i) |
752 | 39.3k | out.append(ret.to[i]); |
753 | 39.3k | in = ret.last + 1; |
754 | 39.3k | } |
755 | 39.3k | } |
756 | 19.6k | return no_err; |
757 | 19.6k | } |
758 | | bool encode(FilterChar * & in, FilterChar * & stop, |
759 | 845 | FilterCharVector & buf) const { |
760 | 845 | buf.clear(); |
761 | 23.0M | while (in < stop) { |
762 | 23.0M | if (*in == 0) { |
763 | 2.65M | buf.append(FilterChar(0)); |
764 | 2.65M | ++in; |
765 | 20.3M | } else { |
766 | 20.3M | NormLookupRet<E,FilterChar> ret = norm_lookup<E>(data, in, stop, (const byte *)"?", in); |
767 | 20.3M | const FilterChar * end = ret.last + 1; |
768 | 20.3M | unsigned width = 0; |
769 | 40.7M | for (; in != end; ++in) width += in->width; |
770 | 20.3M | buf.append(FilterChar(ret.to[0], width)); |
771 | 20.3M | for (unsigned i = 1; i < E::max_to && ret.to[i]; ++i) { |
772 | 1 | buf.append(FilterChar(ret.to[i],0)); |
773 | 1 | } |
774 | 20.3M | } |
775 | 23.0M | } |
776 | 845 | buf.append(0); |
777 | 845 | in = buf.pbegin(); |
778 | 845 | stop = buf.pend(); |
779 | 845 | return true; |
780 | 845 | } |
781 | | }; |
782 | | |
783 | | ////////////////////////////////////////////////////////////////////// |
784 | | // |
785 | | // UTF8 |
786 | | // |
787 | | |
788 | | #define get_check_next \ |
789 | 1.25k | if (in == stop) goto error; \ |
790 | 1.25k | c = *in; \ |
791 | 1.25k | if ((c & 0xC0/*1100 0000*/) != 0x80/*10xx xxxx*/) goto error;\ |
792 | 1.25k | ++in; \ |
793 | 1.24k | u <<= 6; \ |
794 | 1.24k | u |= c & 0x3F/*0011 1111*/; \ |
795 | 1.24k | ++w; |
796 | | |
797 | | static inline FilterChar from_utf8 (const char * & in, const char * stop = 0, |
798 | | Uni32 err_char = '?') |
799 | 73.6k | { |
800 | 73.6k | Uni32 u = (Uni32)(-1); |
801 | 73.6k | FilterChar::Width w = 1; |
802 | | |
803 | | // the first char is guaranteed not to be off the end |
804 | 73.6k | char c = *in; |
805 | 73.6k | ++in; |
806 | | |
807 | 73.6k | if ((c & 0x80/*1000 0000*/) == 0x00/*0xxx xxx*/) { |
808 | 73.0k | u = c; |
809 | 73.0k | } else if ((c & 0xE0/*1110 0000*/) == 0xC0/*110x xxxx*/) { // 2-byte wide |
810 | 68 | u = c & 0x1F/*0001 1111*/; |
811 | 136 | get_check_next; |
812 | 526 | } else if ((c & 0xF0/*1111 0000*/) == 0xE0/*1110 xxxx*/) { // 3-byte wide |
813 | 378 | u = c & 0x0F/*0000 1111*/; |
814 | 754 | get_check_next; |
815 | 754 | get_check_next; |
816 | 751 | } else if ((c & 0xF8/*1111 1000*/) == 0xF0/*1111 0xxx*/) { // 4-byte wide |
817 | 145 | u = c & 0x07/*0000 0111*/; |
818 | 289 | get_check_next; |
819 | 289 | get_check_next; |
820 | 287 | get_check_next; |
821 | 286 | } else { |
822 | 3 | goto error; |
823 | 3 | } |
824 | | |
825 | 73.6k | return FilterChar(u, w); |
826 | 8 | error: |
827 | 8 | return FilterChar(err_char, w); |
828 | 73.6k | } |
829 | | |
830 | | static inline void to_utf8 (FilterChar in, CharVector & out) |
831 | 0 | { |
832 | 0 | FilterChar::Chr c = in; |
833 | | |
834 | 0 | if (c < 0x80) { |
835 | 0 | out.append(c); |
836 | 0 | } |
837 | 0 | else if (c < 0x800) { |
838 | 0 | out.append(0xC0 | (c>>6)); |
839 | 0 | out.append(0x80 | (c & 0x3F)); |
840 | 0 | } |
841 | 0 | else if (c < 0x10000) { |
842 | 0 | out.append(0xE0 | (c>>12)); |
843 | 0 | out.append(0x80 | (c>>6 & 0x3F)); |
844 | 0 | out.append(0x80 | (c & 0x3F)); |
845 | 0 | } |
846 | 0 | else if (c < 0x200000) { |
847 | 0 | out.append(0xF0 | (c>>18)); |
848 | 0 | out.append(0x80 | (c>>12 & 0x3F)); |
849 | 0 | out.append(0x80 | (c>>6 & 0x3F)); |
850 | 0 | out.append(0x80 | (c & 0x3F)); |
851 | 0 | } |
852 | 0 | } |
853 | | |
854 | | struct DecodeUtf8 : public Decode |
855 | | { |
856 | | ToUniLookup lookup; |
857 | 0 | void decode(const char * in, int size, FilterCharVector & out) const { |
858 | 0 | if (size == -1) { |
859 | 0 | while (*in) |
860 | 0 | out.append(from_utf8(in)); |
861 | 0 | } else { |
862 | 0 | const char * stop = in + size; |
863 | 0 | while (in != stop) |
864 | 0 | out.append(from_utf8(in, stop)); |
865 | 0 | } |
866 | 0 | } |
867 | | PosibErr<void> decode_ec(const char * in, int size, |
868 | 24.2k | FilterCharVector & out, ParmStr orig) const { |
869 | 24.2k | const char * begin = in; |
870 | 24.2k | if (size == -1) { |
871 | 34.3k | while (*in) { |
872 | 31.3k | FilterChar c = from_utf8(in, 0, (Uni32)-1); |
873 | 31.3k | if (c == (Uni32)-1) goto error; |
874 | 31.3k | out.append(c); |
875 | 31.3k | } |
876 | 21.1k | } else { |
877 | 21.1k | const char * stop = in + size; |
878 | 63.4k | while (in != stop) { |
879 | 42.3k | FilterChar c = from_utf8(in, stop, (Uni32)-1); |
880 | 42.3k | if (c == (Uni32)-1) goto error; |
881 | 42.3k | out.append(c); |
882 | 42.3k | } |
883 | 21.1k | } |
884 | 24.2k | return no_err; |
885 | 8 | error: |
886 | 8 | char m[70]; |
887 | 8 | snprintf(m, 70, _("Invalid UTF-8 sequence at position %ld."), (long)(in - begin)); |
888 | 8 | return make_err(invalid_string, orig, m); |
889 | 24.2k | } |
890 | | }; |
891 | | |
892 | | struct EncodeUtf8 : public Encode |
893 | | { |
894 | | FromUniLookup lookup; |
895 | | void encode(const FilterChar * in, const FilterChar * stop, |
896 | 0 | CharVector & out) const { |
897 | 0 | for (; in != stop; ++in) { |
898 | 0 | to_utf8(*in, out); |
899 | 0 | } |
900 | 0 | } |
901 | | PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop, |
902 | 0 | CharVector & out, ParmStr) const { |
903 | 0 | for (; in != stop; ++in) { |
904 | 0 | to_utf8(*in, out); |
905 | 0 | } |
906 | 0 | return no_err; |
907 | 0 | } |
908 | | }; |
909 | | |
910 | | ////////////////////////////////////////////////////////////////////// |
911 | | // |
912 | | // Cache |
913 | | // |
914 | | |
// File-scope GlobalCache instances, one each for Decode, Encode and
// NormTables objects, constructed with the names "decode", "encode"
// and "norm_tables".  The Convert::init* functions in this file pass
// their addresses to setup().
915 | | static GlobalCache<Decode> decode_cache("decode"); |
916 | | static GlobalCache<Encode> encode_cache("encode"); |
917 | | static GlobalCache<NormTables> norm_tables_cache("norm_tables"); |
918 | | |
919 | | ////////////////////////////////////////////////////////////////////// |
920 | | // |
921 | | // new_aspell_convert |
922 | | // |
923 | | |
924 | | void Convert::generic_convert(const char * in, int size, CharVector & out) |
925 | 1.32k | { |
926 | 1.32k | buf_.clear(); |
927 | 1.32k | decode_->decode(in, size, buf_); |
928 | 1.32k | FilterChar * start = buf_.pbegin(); |
929 | 1.32k | FilterChar * stop = buf_.pend(); |
930 | 1.32k | if (!filter.empty()) |
931 | 1.32k | filter.process(start, stop); |
932 | 1.32k | encode_->encode(start, stop, out); |
933 | 1.32k | } |
934 | | |
935 | | const char * fix_encoding_str(ParmStr enc, String & buf) |
936 | 40.4k | { |
937 | 40.4k | buf.clear(); |
938 | 40.4k | buf.reserve(enc.size() + 1); |
939 | 414k | for (size_t i = 0; i != enc.size(); ++i) |
940 | 373k | buf.push_back(asc_tolower(enc[i])); |
941 | | |
942 | 40.4k | if (strncmp(buf.c_str(), "iso8859", 7) == 0) |
943 | 2.70k | buf.insert(buf.begin() + 3, '-'); // For backwards compatibility |
944 | | |
945 | 40.4k | if (buf == "ascii" || buf == "ansi_x3.4-1968") |
946 | 0 | return "iso-8859-1"; |
947 | 40.4k | else if (buf == "machine unsigned 16" || buf == "utf-16") |
948 | 52 | return "ucs-2"; |
949 | 40.4k | else if (buf == "machine unsigned 32" || buf == "utf-32") |
950 | 226 | return "ucs-4"; |
951 | 40.2k | else |
952 | 40.2k | return buf.c_str(); |
953 | 40.4k | } |
954 | | |
955 | | bool ascii_encoding(const Config & c, ParmStr enc0) |
956 | 2.11k | { |
957 | 2.11k | if (enc0.empty()) return true; |
958 | 2.11k | if (enc0 == "ANSI_X3.4-1968" |
959 | 2.11k | || enc0 == "ASCII" || enc0 == "ascii") return true; |
960 | 0 | String buf; |
961 | 0 | const char * enc = fix_encoding_str(enc0, buf); |
962 | 0 | if (strcmp(enc, "utf-8") == 0 |
963 | 0 | || strcmp(enc, "ucs-2") == 0 |
964 | 0 | || strcmp(enc, "ucs-4") == 0) return false; |
965 | 0 | String dir1,dir2,file_name; |
966 | 0 | fill_data_dir(&c, dir1, dir2); |
967 | 0 | file_name << dir1 << enc << ".cset"; |
968 | 0 | if (file_exists(file_name)) return false; |
969 | 0 | if (dir1 == dir2) return true; |
970 | 0 | file_name.clear(); |
971 | 0 | file_name << dir2 << enc << ".cset"; |
972 | 0 | return !file_exists(file_name); |
973 | 0 | } |
974 | | |
975 | | PosibErr<Convert *> internal_new_convert(const Config & c, |
976 | | ConvKey in, |
977 | | ConvKey out, |
978 | | bool if_needed, |
979 | | Normalize norm) |
980 | 18.8k | { |
981 | 18.8k | String in_s; |
982 | 18.8k | in.val = fix_encoding_str(in.val, in_s); |
983 | | |
984 | 18.8k | String out_s; |
985 | 18.8k | out.val = fix_encoding_str(out.val, out_s); |
986 | | |
987 | 18.8k | if (if_needed && in.val == out.val) return 0; |
988 | | |
989 | 11.2k | StackPtr<Convert> conv(new Convert); |
990 | 11.2k | switch (norm) { |
991 | 54 | case NormNone: |
992 | 54 | RET_ON_ERR(conv->init(c, in, out)); break; |
993 | 2.82k | case NormFrom: |
994 | 2.82k | RET_ON_ERR(conv->init_norm_from(c, in, out)); break; |
995 | 8.37k | case NormTo: |
996 | 8.37k | RET_ON_ERR(conv->init_norm_to(c, in, out)); break; |
997 | 11.2k | } |
998 | 11.2k | return conv.release(); |
999 | 11.2k | } |
1000 | | |
1001 | | PosibErr<Decode *> Decode::get_new(const ConvKey & k, const Config * c) |
1002 | 1.93k | { |
1003 | 1.93k | StackPtr<Decode> ptr; |
1004 | 1.93k | if (k.val == "iso-8859-1") { |
1005 | 828 | ptr.reset(new DecodeDirect<Uni8>); |
1006 | 1.10k | } else if (k.val == "ucs-2") { |
1007 | 26 | if (k.allow_ucs) |
1008 | 26 | ptr.reset(new DecodeDirect<Uni16>); |
1009 | 0 | else |
1010 | 0 | return make_err(encoding_not_supported, k.val); |
1011 | 1.08k | } else if (k.val == "ucs-4") { |
1012 | 113 | if (k.allow_ucs) |
1013 | 113 | ptr.reset(new DecodeDirect<Uni32>); |
1014 | 0 | else |
1015 | 0 | return make_err(encoding_not_supported, k.val); |
1016 | 968 | } else if (k.val == "utf-8") { |
1017 | 947 | ptr.reset(new DecodeUtf8); |
1018 | 947 | } else { |
1019 | 21 | ptr.reset(new DecodeLookup); |
1020 | 21 | } |
1021 | 1.93k | RET_ON_ERR(ptr->init(k.val, *c)); |
1022 | 1.92k | ptr->key = k.val; |
1023 | 1.92k | return ptr.release(); |
1024 | 1.93k | } |
1025 | | |
1026 | | PosibErr<Encode *> Encode::get_new(const ConvKey & k, const Config * c) |
1027 | 1.97k | { |
1028 | 1.97k | StackPtr<Encode> ptr; |
1029 | 1.97k | if (k.val == "iso-8859-1") { |
1030 | 828 | ptr.reset(new EncodeDirect<Uni8>); |
1031 | 1.14k | } else if (k.val == "ucs-2" && k.allow_ucs) { |
1032 | 26 | if (k.allow_ucs) |
1033 | 26 | ptr.reset(new EncodeDirect<Uni16>); |
1034 | 0 | else |
1035 | 0 | return make_err(encoding_not_supported, k.val); |
1036 | 1.12k | } else if (k.val == "ucs-4" && k.allow_ucs) { |
1037 | 162 | if (k.allow_ucs) |
1038 | 162 | ptr.reset(new EncodeDirect<Uni32>); |
1039 | 0 | else |
1040 | 0 | return make_err(encoding_not_supported, k.val); |
1041 | 958 | } else if (k.val == "utf-8") { |
1042 | 947 | ptr.reset(new EncodeUtf8); |
1043 | 947 | } else { |
1044 | 11 | ptr.reset(new EncodeLookup); |
1045 | 11 | } |
1046 | 1.97k | RET_ON_ERR(ptr->init(k.val, *c)); |
1047 | 1.97k | ptr->key = k.val; |
1048 | 1.97k | return ptr.release(); |
1049 | 1.97k | } |
1050 | | |
// Destructor, defined out of line here with an empty body.
1051 | 11.2k | Convert::~Convert() {} |
1052 | | |
1053 | | PosibErr<void> Convert::init(const Config & c, const ConvKey & in, const ConvKey & out) |
1054 | 870 | { |
1055 | 870 | RET_ON_ERR(setup(decode_c, &decode_cache, &c, in)); |
1056 | 867 | decode_ = decode_c.get(); |
1057 | 867 | RET_ON_ERR(setup(encode_c, &encode_cache, &c, out)); |
1058 | 867 | encode_ = encode_c.get(); |
1059 | | |
1060 | 867 | conv_ = 0; |
1061 | 867 | if (in.val == out.val) { |
1062 | 56 | if (in.val == "ucs-2") { |
1063 | 0 | if (in.allow_ucs) { |
1064 | 0 | conv_ = new ConvDirect<Uni16>; |
1065 | 0 | } else { |
1066 | 0 | return make_err(encoding_not_supported, in.val); |
1067 | 0 | } |
1068 | 56 | } else if (in.val == "ucs-4") { |
1069 | 0 | if (in.allow_ucs) { |
1070 | 0 | conv_ = new ConvDirect<Uni32>; |
1071 | 0 | } else { |
1072 | 0 | return make_err(encoding_not_supported, in.val); |
1073 | 0 | } |
1074 | 56 | } else { |
1075 | 56 | conv_ = new ConvDirect<char>; |
1076 | 56 | } |
1077 | 56 | } |
1078 | | |
1079 | 867 | if (conv_) |
1080 | 56 | RET_ON_ERR(conv_->init(decode_, encode_, c)); |
1081 | | |
1082 | 867 | return no_err; |
1083 | 867 | } |
1084 | | |
1085 | | |
1086 | | PosibErr<void> Convert::init_norm_from(const Config & c, const ConvKey & in, const ConvKey & out) |
1087 | 2.82k | { |
1088 | 2.82k | if (!c.retrieve_bool("normalize") && !c.retrieve_bool("norm-required")) |
1089 | 204 | return init(c,in,out); |
1090 | | |
1091 | 2.61k | RET_ON_ERR(setup(norm_tables_, &norm_tables_cache, &c, out.val)); |
1092 | | |
1093 | 2.61k | RET_ON_ERR(setup(decode_c, &decode_cache, &c, in)); |
1094 | 2.61k | decode_ = decode_c.get(); |
1095 | | |
1096 | 2.61k | if (c.retrieve_bool("norm-strict")) { |
1097 | 3 | encode_s = new EncodeNormLookup(norm_tables_->strict); |
1098 | 3 | encode_ = encode_s; |
1099 | 3 | encode_->key = out.val; |
1100 | 3 | encode_->key += ":strict"; |
1101 | 2.60k | } else { |
1102 | 2.60k | encode_s = new EncodeNormLookup(norm_tables_->internal); |
1103 | 2.60k | encode_ = encode_s; |
1104 | 2.60k | encode_->key = out.val; |
1105 | 2.60k | encode_->key += ":internal"; |
1106 | 2.60k | } |
1107 | 2.61k | conv_ = 0; |
1108 | | |
1109 | 2.61k | return no_err; |
1110 | 2.61k | } |
1111 | | |
1112 | | PosibErr<void> Convert::init_norm_to(const Config & c, const ConvKey & in, const ConvKey & out) |
1113 | 8.37k | { |
1114 | 8.37k | String norm_form = c.retrieve("norm-form"); |
1115 | 8.37k | if ((!c.retrieve_bool("normalize") || norm_form == "none") |
1116 | 612 | && !c.retrieve_bool("norm-required")) |
1117 | 612 | return init(c,in,out); |
1118 | 7.76k | if (norm_form == "none" && c.retrieve_bool("norm-required")) |
1119 | 0 | norm_form = "nfc"; |
1120 | | |
1121 | 7.76k | RET_ON_ERR(setup(norm_tables_, &norm_tables_cache, &c, in.val)); |
1122 | | |
1123 | 7.76k | RET_ON_ERR(setup(encode_c, &encode_cache, &c, out)); |
1124 | 7.76k | encode_ = encode_c.get(); |
1125 | | |
1126 | 7.76k | NormTables::ToUni::const_iterator i = norm_tables_->to_uni.begin(); |
1127 | 15.5k | for (; i != norm_tables_->to_uni.end() && i->name != norm_form; ++i); |
1128 | 7.76k | if (i == norm_tables_->to_uni.end()) |
1129 | 0 | return make_err(aerror_bad_value, "norm-form", norm_form, "one of none, nfd, nfc, or comp"); |
1130 | | |
1131 | 7.76k | decode_s = new DecodeNormLookup(i->ptr); |
1132 | 7.76k | decode_ = decode_s; |
1133 | 7.76k | decode_->key = in.val; |
1134 | 7.76k | decode_->key += ':'; |
1135 | 7.76k | decode_->key += i->name; |
1136 | | |
1137 | 7.76k | conv_ = 0; |
1138 | | |
1139 | 7.76k | return no_err; |
1140 | 7.76k | } |
1141 | | |
1142 | | PosibErr<void> MBLen::setup(const Config &, ParmStr enc0) |
1143 | 0 | { |
1144 | 0 | String buf; |
1145 | 0 | const char * enc = fix_encoding_str(enc0,buf); |
1146 | 0 | if (strcmp(enc, "utf-8") == 0) encoding = UTF8; |
1147 | 0 | else if (strcmp(enc, "ucs-2") == 0) encoding = UCS2; |
1148 | 0 | else if (strcmp(enc, "ucs-4") == 0) encoding = UCS4; |
1149 | 0 | else encoding = Other; |
1150 | 0 | return no_err; |
1151 | 0 | } |
1152 | | |
1153 | | unsigned MBLen::operator()(const char * str, const char * stop) |
1154 | 0 | { |
1155 | 0 | unsigned size = 0; |
1156 | 0 | switch (encoding) { |
1157 | 0 | case Other: |
1158 | 0 | return stop - str; |
1159 | 0 | case UTF8: |
1160 | 0 | for (; str != stop; ++str) { |
1161 | 0 | if ((*str & 0x80) == 0 || (*str & 0xC0) == 0xC0) ++size; |
1162 | 0 | } |
1163 | 0 | return size; |
1164 | 0 | case UCS2: |
1165 | 0 | return (stop - str)/2; |
1166 | 0 | case UCS4: |
1167 | 0 | return (stop - str)/4; |
1168 | 0 | } |
1169 | 0 | return 0; |
1170 | 0 | } |
1171 | | |
1172 | 0 | PosibErr<void> unsupported_null_term_wide_string_err_(const char * func) { |
1173 | 0 | static bool reported_to_stderr = false; |
1174 | 0 | PosibErr<void> err = make_err(other_error, unsupported_null_term_wide_string_msg); |
1175 | 0 | if (!reported_to_stderr) { |
1176 | 0 | CERR.printf("ERROR: %s: %s\n", func, unsupported_null_term_wide_string_msg); |
1177 | 0 | reported_to_stderr = true; |
1178 | 0 | } |
1179 | 0 | return err; |
1180 | 0 | } |
1181 | | |
1182 | 0 | void unsupported_null_term_wide_string_abort_(const char * func) { |
1183 | 0 | CERR.printf("%s: %s\n", func, unsupported_null_term_wide_string_msg); |
1184 | 0 | abort(); |
1185 | 0 | } |
1186 | | |
1187 | | } |