/src/aspell/common/convert.cpp
Line | Count | Source |
1 | | // This file is part of The New Aspell |
2 | | // Copyright (C) 2001 by Kevin Atkinson under the GNU LGPL license |
3 | | // version 2.0 or 2.1. You should have received a copy of the LGPL |
4 | | // license along with this library if you did not you can find |
5 | | // it at http://www.gnu.org/. |
6 | | |
7 | | #include <assert.h> |
8 | | #include <string.h> |
9 | | #include <math.h> |
10 | | |
11 | | #include "asc_ctype.hpp" |
12 | | #include "convert.hpp" |
13 | | #include "fstream.hpp" |
14 | | #include "getdata.hpp" |
15 | | #include "config.hpp" |
16 | | #include "errors.hpp" |
17 | | #include "stack_ptr.hpp" |
18 | | #include "cache-t.hpp" |
19 | | #include "file_util.hpp" |
20 | | #include "file_data_util.hpp" |
21 | | #include "vararray.hpp" |
22 | | |
23 | | #include "iostream.hpp" |
24 | | |
25 | | #include "gettext.h" |
26 | | |
27 | | namespace acommon { |
28 | | |
// Unsigned integer aliases used by the lookup and normalization tables
// defined in this file.
typedef unsigned char  byte;
typedef unsigned char  Uni8;
typedef unsigned short Uni16;
typedef unsigned int   Uni32;
33 | | |
34 | | |
35 | | ////////////////////////////////////////////////////////////////////// |
36 | | ////////////////////////////////////////////////////////////////////// |
37 | | // |
38 | | // Lookups |
39 | | // |
40 | | ////////////////////////////////////////////////////////////////////// |
41 | | ////////////////////////////////////////////////////////////////////// |
42 | | |
43 | | ////////////////////////////////////////////////////////////////////// |
44 | | // |
45 | | // ToUniLookup |
46 | | // |
47 | | |
48 | | class ToUniLookup |
49 | | { |
50 | | Uni32 data[256]; |
51 | | static const Uni32 npos = (Uni32)(-1); |
52 | | public: |
53 | | void reset(); |
54 | 391k | Uni32 operator[] (char key) const {return data[(unsigned char)key];} |
55 | 0 | bool have(char key) const {return data[(unsigned char)key] != npos;} |
56 | | bool insert(char key, Uni32 value); |
57 | | }; |
58 | | |
59 | | void ToUniLookup::reset() |
60 | 51 | { |
61 | 13.1k | for (int i = 0; i != 256; ++i) |
62 | 13.0k | data[i] = npos; |
63 | 51 | } |
64 | | |
65 | | bool ToUniLookup::insert(char key, Uni32 value) |
66 | 9.21k | { |
67 | 9.21k | if (data[(unsigned char)key] != npos) |
68 | 0 | return false; |
69 | 9.21k | data[(unsigned char)key] = value; |
70 | 9.21k | return true; |
71 | 9.21k | } |
72 | | |
73 | | ////////////////////////////////////////////////////////////////////// |
74 | | // |
75 | | // FromUniLookup |
76 | | // |
77 | | |
78 | | // Assumes that the maximum number of items in the table is 256 |
79 | | // Also assumes (unsigned char)i == i % 256 |
80 | | |
81 | | // Based on the iso-8859-* character sets it is very fast, almost all |
82 | | // lookups involving no more than 2 comparisons. |
83 | | // No lookup involved more than 3 comparisons. |
84 | | // Also, no division (or modulus) is done whatsoever. |
85 | | |
86 | | |
87 | | struct UniItem { |
88 | | Uni32 key; |
89 | | char value; |
90 | | }; |
91 | | |
92 | | class FromUniLookup |
93 | | { |
94 | | private: |
95 | | static const Uni32 npos = (Uni32)(-1); |
96 | | UniItem * overflow_end; |
97 | | |
98 | | UniItem data[256*4]; |
99 | | |
100 | | UniItem overflow[256]; // you can never be too careful; |
101 | | |
102 | | public: |
103 | 1.24k | FromUniLookup() {} |
104 | | void reset(); |
105 | | inline char operator() (Uni32 key, char unknown = '?') const; |
106 | | bool insert(Uni32 key, char value); |
107 | | }; |
108 | | |
109 | | void FromUniLookup::reset() |
110 | 51 | { |
111 | 52.2k | for (unsigned i = 0; i != 256*4; ++i) |
112 | 52.2k | data[i].key = npos; |
113 | 51 | overflow_end = overflow; |
114 | 51 | } |
115 | | |
116 | | inline char FromUniLookup::operator() (Uni32 k, char unknown) const |
117 | 53.0k | { |
118 | 53.0k | const UniItem * i = data + (unsigned char)k * 4; |
119 | | |
120 | 53.0k | if (i->key == k) return i->value; |
121 | 41 | ++i; |
122 | 41 | if (i->key == k) return i->value; |
123 | 0 | ++i; |
124 | 0 | if (i->key == k) return i->value; |
125 | 0 | ++i; |
126 | 0 | if (i->key == k) return i->value; |
127 | | |
128 | 0 | if (i->key == npos) return unknown; |
129 | | |
130 | 0 | for(i = overflow; i != overflow_end; ++i) |
131 | 0 | if (i->key == k) return i->value; |
132 | | |
133 | 0 | return unknown; |
134 | 0 | } |
135 | | |
136 | | bool FromUniLookup::insert(Uni32 k, char v) |
137 | 9.21k | { |
138 | 9.21k | UniItem * i = data + (unsigned char)k * 4; |
139 | 9.21k | UniItem * e = i + 4; |
140 | 10.9k | while (i != e && i->key != npos) { |
141 | 1.76k | if (i->key == k) |
142 | 0 | return false; |
143 | 1.76k | ++i; |
144 | 1.76k | } |
145 | 9.21k | if (i == e) { |
146 | 0 | for(i = overflow; i != overflow_end; ++i) |
147 | 0 | if (i->key == k) return false; |
148 | 0 | } |
149 | 9.21k | i->key = k; |
150 | 9.21k | i->value = v; |
151 | 9.21k | return true; |
152 | 9.21k | } |
153 | | |
154 | | ////////////////////////////////////////////////////////////////////// |
155 | | // |
156 | | // CharLookup |
157 | | // |
158 | | |
// Direct-indexed char -> char table.  Slots are ints so that -1 can mark
// an empty slot; stored values are kept in the range 0..255 so that no
// value can be mistaken for that marker.  The constructor does not
// initialise the slots; reset() does.
class CharLookup
{
private:
  int data[256];
public:
  void reset();
  // Returns the value stored for key.  For an empty slot the -1 marker
  // is converted to char and returned.
  char operator[] (char key) const {return (char)data[(unsigned char)key];}
  bool insert(char key, char value);
};

// Marks all 256 slots as empty.
void CharLookup::reset() {
  for (int i = 0; i != 256; ++i)
    data[i] = -1;
}

// Stores value for key if nothing has been stored for key yet.
// Returns true when stored, false when the slot was already occupied.
bool CharLookup::insert(char key, char value)
{
  if (data[(unsigned char)key] != -1)
    return false;
  // Store through unsigned char: where char is signed, (char)0xFF would
  // otherwise be stored as -1 and leave the slot looking empty.
  data[(unsigned char)key] = (unsigned char)value;
  return true;
}
181 | | |
182 | | ////////////////////////////////////////////////////////////////////// |
183 | | // |
184 | | // NormLookup |
185 | | // |
186 | | |
// Header of a variable-length table of T entries.  The entries follow the
// header in the same allocation; `data` is declared with one element only
// as a stand-in for a trailing array.
template <class T>
struct NormTable
{
  static const unsigned struct_size;  // bytes to allocate in addition to the entries
  unsigned mask;    // applied to a key to pick the first entry to probe
  unsigned height;  // distance, in entries, between successive probes
  unsigned width;
  unsigned size;
  T * end;          // one past the last entry of `data`
  T data[1]; // hack for data[]
};

template <class T>
const unsigned NormTable<T>::struct_size
  = static_cast<unsigned>(sizeof(NormTable<T>) - 1);
201 | | |
// Result of norm_lookup(): the output sequence that was selected and the
// input position it is associated with.
template <class T, class From>
struct NormLookupRet
{
  const typename T::To * to;
  const From * last;
  NormLookupRet(const typename T::To * t, From * l)
  {
    to = t;
    last = l;
  }
};
210 | | |
211 | | template <class T, class From> |
212 | | static inline NormLookupRet<T,From> norm_lookup(const NormTable<T> * d, |
213 | | From * s, From * stop, |
214 | | const typename T::To * def, |
215 | | From * prev) |
216 | 46.4M | { |
217 | 64.4M | loop: |
218 | 64.4M | if (s != stop) { |
219 | 64.4M | const T * i = d->data + (static_cast<typename T::From>(*s) & d->mask); |
220 | 72.7M | for (;;) { |
221 | 72.7M | if (i->from == static_cast<typename T::From>(*s)) { |
222 | 46.4M | if (i->sub_table) { |
223 | | // really tail recursion |
224 | 17.9M | if (i->to[1] != T::to_non_char) {def = i->to; prev = s;} |
225 | 17.9M | d = (const NormTable<T> *)(i->sub_table); |
226 | 17.9M | s++; |
227 | 17.9M | goto loop; |
228 | 28.5M | } else { |
229 | 28.5M | return NormLookupRet<T,From>(i->to, s); |
230 | 28.5M | } |
231 | 46.4M | } else { |
232 | 26.2M | i += d->height; |
233 | 26.2M | if (i >= d->end) break; |
234 | 26.2M | } |
235 | 72.7M | } |
236 | 64.4M | } |
237 | 17.9M | return NormLookupRet<T,From>(def, prev); |
238 | 64.4M | } convert.cpp:acommon::NormLookupRet<acommon::FromUniNormEntry, acommon::FilterChar const> acommon::norm_lookup<acommon::FromUniNormEntry, acommon::FilterChar const>(acommon::NormTable<acommon::FromUniNormEntry> const*, acommon::FilterChar const*, acommon::FilterChar const*, acommon::FromUniNormEntry::To const*, acommon::FilterChar const*) Line | Count | Source | 216 | 16.4M | { | 217 | 24.3M | loop: | 218 | 24.3M | if (s != stop) { | 219 | 24.3M | const T * i = d->data + (static_cast<typename T::From>(*s) & d->mask); | 220 | 28.3M | for (;;) { | 221 | 28.3M | if (i->from == static_cast<typename T::From>(*s)) { | 222 | 16.4M | if (i->sub_table) { | 223 | | // really tail recursion | 224 | 7.92M | if (i->to[1] != T::to_non_char) {def = i->to; prev = s;} | 225 | 7.92M | d = (const NormTable<T> *)(i->sub_table); | 226 | 7.92M | s++; | 227 | 7.92M | goto loop; | 228 | 8.51M | } else { | 229 | 8.51M | return NormLookupRet<T,From>(i->to, s); | 230 | 8.51M | } | 231 | 16.4M | } else { | 232 | 11.9M | i += d->height; | 233 | 11.9M | if (i >= d->end) break; | 234 | 11.9M | } | 235 | 28.3M | } | 236 | 24.3M | } | 237 | 7.92M | return NormLookupRet<T,From>(def, prev); | 238 | 24.3M | } |
convert.cpp:acommon::NormLookupRet<acommon::FromUniNormEntry, acommon::FilterChar> acommon::norm_lookup<acommon::FromUniNormEntry, acommon::FilterChar>(acommon::NormTable<acommon::FromUniNormEntry> const*, acommon::FilterChar*, acommon::FilterChar*, acommon::FromUniNormEntry::To const*, acommon::FilterChar*) Line | Count | Source | 216 | 26.7M | { | 217 | 36.7M | loop: | 218 | 36.7M | if (s != stop) { | 219 | 36.7M | const T * i = d->data + (static_cast<typename T::From>(*s) & d->mask); | 220 | 41.0M | for (;;) { | 221 | 41.0M | if (i->from == static_cast<typename T::From>(*s)) { | 222 | 26.6M | if (i->sub_table) { | 223 | | // really tail recursion | 224 | 10.0M | if (i->to[1] != T::to_non_char) {def = i->to; prev = s;} | 225 | 10.0M | d = (const NormTable<T> *)(i->sub_table); | 226 | 10.0M | s++; | 227 | 10.0M | goto loop; | 228 | 16.6M | } else { | 229 | 16.6M | return NormLookupRet<T,From>(i->to, s); | 230 | 16.6M | } | 231 | 26.6M | } else { | 232 | 14.3M | i += d->height; | 233 | 14.3M | if (i >= d->end) break; | 234 | 14.3M | } | 235 | 41.0M | } | 236 | 36.7M | } | 237 | 10.0M | return NormLookupRet<T,From>(def, prev); | 238 | 36.7M | } |
convert.cpp:acommon::NormLookupRet<acommon::ToUniNormEntry, char const> acommon::norm_lookup<acommon::ToUniNormEntry, char const>(acommon::NormTable<acommon::ToUniNormEntry> const*, char const*, char const*, acommon::ToUniNormEntry::To const*, char const*) Line | Count | Source | 216 | 3.34M | { | 217 | 3.34M | loop: | 218 | 3.34M | if (s != stop) { | 219 | 3.34M | const T * i = d->data + (static_cast<typename T::From>(*s) & d->mask); | 220 | 3.34M | for (;;) { | 221 | 3.34M | if (i->from == static_cast<typename T::From>(*s)) { | 222 | 3.34M | if (i->sub_table) { | 223 | | // really tail recursion | 224 | 0 | if (i->to[1] != T::to_non_char) {def = i->to; prev = s;} | 225 | 0 | d = (const NormTable<T> *)(i->sub_table); | 226 | 0 | s++; | 227 | 0 | goto loop; | 228 | 3.34M | } else { | 229 | 3.34M | return NormLookupRet<T,From>(i->to, s); | 230 | 3.34M | } | 231 | 3.34M | } else { | 232 | 0 | i += d->height; | 233 | 0 | if (i >= d->end) break; | 234 | 0 | } | 235 | 3.34M | } | 236 | 3.34M | } | 237 | 0 | return NormLookupRet<T,From>(def, prev); | 238 | 3.34M | } |
|
239 | | |
240 | | template <class T> |
241 | | void free_norm_table(NormTable<T> * d) |
242 | 39.7k | { |
243 | 5.96M | for (T * cur = d->data; cur != d->end; ++cur) { |
244 | 5.92M | if (cur->sub_table) |
245 | 35.3k | free_norm_table<T>(static_cast<NormTable<T> *>(cur->sub_table)); |
246 | 5.92M | } |
247 | 39.7k | free(d); |
248 | 39.7k | } void acommon::free_norm_table<acommon::FromUniNormEntry>(acommon::NormTable<acommon::FromUniNormEntry>*) Line | Count | Source | 242 | 37.5k | { | 243 | 5.39M | for (T * cur = d->data; cur != d->end; ++cur) { | 244 | 5.35M | if (cur->sub_table) | 245 | 35.3k | free_norm_table<T>(static_cast<NormTable<T> *>(cur->sub_table)); | 246 | 5.35M | } | 247 | 37.5k | free(d); | 248 | 37.5k | } |
void acommon::free_norm_table<acommon::ToUniNormEntry>(acommon::NormTable<acommon::ToUniNormEntry>*) Line | Count | Source | 242 | 2.21k | { | 243 | 567k | for (T * cur = d->data; cur != d->end; ++cur) { | 244 | 565k | if (cur->sub_table) | 245 | 0 | free_norm_table<T>(static_cast<NormTable<T> *>(cur->sub_table)); | 246 | 565k | } | 247 | 2.21k | free(d); | 248 | 2.21k | } |
|
249 | | |
250 | | struct FromUniNormEntry |
251 | | { |
252 | | typedef Uni32 From; |
253 | | Uni32 from; |
254 | | typedef byte To; |
255 | | byte to[4]; |
256 | | static const From from_non_char = (From)(-1); |
257 | | static const To to_non_char = 0x10; |
258 | | static const unsigned max_to = 4; |
259 | | void * sub_table; |
260 | | } |
261 | | #ifdef __GNUC__ |
262 | | __attribute__ ((aligned (16))) |
263 | | #endif |
264 | | ; |
265 | | |
266 | | struct ToUniNormEntry |
267 | | { |
268 | | typedef byte From; |
269 | | byte from; |
270 | | typedef Uni16 To; |
271 | | Uni16 to[3]; |
272 | | static const From from_non_char = 0x10; |
273 | | static const To to_non_char = 0x10; |
274 | | static const unsigned max_to = 3; |
275 | | void * sub_table; |
276 | | } |
277 | | #ifdef __GNUC__ |
278 | | __attribute__ ((aligned (16))) |
279 | | #endif |
280 | | ; |
281 | | |
282 | | ////////////////////////////////////////////////////////////////////// |
283 | | // |
284 | | // read in char data |
285 | | // |
286 | | |
287 | | PosibErr<void> read_in_char_data (const Config & config, |
288 | | ParmStr encoding, |
289 | | ToUniLookup & to, |
290 | | FromUniLookup & from) |
291 | 51 | { |
292 | 51 | to.reset(); |
293 | 51 | from.reset(); |
294 | | |
295 | 51 | String dir1,dir2,file_name; |
296 | 51 | fill_data_dir(&config, dir1, dir2); |
297 | 51 | find_file(file_name,dir1,dir2,encoding,".cset"); |
298 | | |
299 | 51 | FStream data; |
300 | 51 | PosibErrBase err = data.open(file_name, "r"); |
301 | 51 | if (err.get_err()) { |
302 | 15 | char mesg[300]; |
303 | 15 | snprintf(mesg, 300, _("This could also mean that the file \"%s\" could not be opened for reading or does not exist."), |
304 | 15 | file_name.c_str()); |
305 | 15 | return make_err(unknown_encoding, encoding, mesg); |
306 | 15 | } |
307 | 36 | unsigned chr; |
308 | 36 | Uni32 uni; |
309 | 36 | String line; |
310 | 36 | char * p; |
311 | 108 | do { |
312 | 108 | p = get_nb_line(data, line); |
313 | 108 | } while (*p != '/'); |
314 | 9.25k | for (chr = 0; chr != 256; ++chr) { |
315 | 9.21k | p = get_nb_line(data, line); |
316 | 9.21k | if (strtoul(p, 0, 16) != chr) |
317 | 0 | return make_err(bad_file_format, file_name); |
318 | 9.21k | uni = strtoul(p + 3, 0, 16); |
319 | 9.21k | to.insert(chr, uni); |
320 | 9.21k | from.insert(uni, chr); |
321 | 9.21k | } |
322 | | |
323 | 36 | return no_err; |
324 | 36 | } |
325 | | |
326 | | ////////////////////////////////////////////////////////////////////// |
327 | | // |
328 | | // read in norm data |
329 | | // |
330 | | |
331 | | struct Tally |
332 | | { |
333 | | int size; |
334 | | Uni32 mask; |
335 | | int max; |
336 | | int * data; |
337 | 119k | Tally(int s, int * d) : size(s), mask(s - 1), max(0), data(d) { |
338 | 119k | memset(data, 0, sizeof(int)*size); |
339 | 119k | } |
340 | 6.08M | void add(Uni32 chr) { |
341 | 6.08M | Uni32 p = chr & mask; |
342 | 6.08M | data[p]++; |
343 | 6.08M | if (data[p] > max) max = data[p]; |
344 | 6.08M | } |
345 | | }; |
346 | | |
347 | | # define sanity(check) \ |
348 | 10.4M | if (!(check)) return sanity_fail(__FILE__, FUNC, __LINE__, #check) |
349 | | |
350 | | static PosibErrBase sanity_fail(const char * file, const char * func, |
351 | | unsigned line, const char * check_str) |
352 | 0 | { |
353 | 0 | char mesg[500]; |
354 | 0 | snprintf(mesg, 500, "%s:%d: %s: Assertion \"%s\" failed.", |
355 | 0 | file, line, func, check_str); |
356 | 0 | return make_err(bad_input_error, mesg); |
357 | 0 | } |
// CREATE_NORM_TABLE(T, in, buf, res): calls create_norm_table<T>(in, buf).
// On error the enclosing function returns that error; otherwise the new
// table is assigned to `res`.
# define CREATE_NORM_TABLE(T, in, buf, res) \
  do { \
    PosibErr<NormTable<T> *> created_( create_norm_table<T>(in,buf) ); \
    if (created_.has_err()) return PosibErrBase(created_); \
    res = created_.data; \
  } while(false)
362 | | |
363 | | template <class T> |
364 | | static PosibErr< NormTable<T> * > create_norm_table(IStream & in, String & buf) |
365 | 39.7k | { |
366 | 39.7k | const char FUNC[] = "create_norm_table"; |
367 | 39.7k | const char * p = get_nb_line(in, buf); |
368 | 39.7k | sanity(*p == 'N'); |
369 | 39.7k | ++p; |
370 | 39.7k | int size = strtoul(p, (char **)&p, 10); |
371 | 39.7k | VARARRAY(T, d, size); |
372 | 39.7k | memset(d, 0, sizeof(T) * size); |
373 | 39.7k | int sz = 1 << (unsigned)floor(log(size <= 1 ? 1.0 : size - 1)/log(2.0)); |
374 | 39.7k | VARARRAY(int, tally0_d, sz); Tally tally0(sz, tally0_d); |
375 | 39.7k | VARARRAY(int, tally1_d, sz*2); Tally tally1(sz*2, tally1_d); |
376 | 39.7k | VARARRAY(int, tally2_d, sz*4); Tally tally2(sz*4, tally2_d); |
377 | 39.7k | T * cur = d; |
378 | 2.06M | while (p = get_nb_line(in, buf), *p != '.') { |
379 | 2.02M | Uni32 f = strtoul(p, (char **)&p, 16); |
380 | 2.02M | cur->from = static_cast<typename T::From>(f); |
381 | 2.02M | sanity(f == cur->from); |
382 | 2.02M | tally0.add(f); |
383 | 2.02M | tally1.add(f); |
384 | 2.02M | tally2.add(f); |
385 | 2.02M | ++p; |
386 | 2.02M | sanity(*p == '>'); |
387 | 2.02M | ++p; |
388 | 2.02M | sanity(*p == ' '); |
389 | 2.02M | ++p; |
390 | 2.02M | unsigned i = 0; |
391 | 2.02M | if (*p != '-') { |
392 | 4.15M | for (;; ++i) { |
393 | 4.15M | const char * q = p; |
394 | 4.15M | Uni32 t = strtoul(p, (char **)&p, 16); |
395 | 4.15M | if (q == p) break; |
396 | 2.12M | sanity(i < d->max_to); |
397 | 2.12M | cur->to[i] = static_cast<typename T::To>(t); |
398 | 2.12M | sanity(t == static_cast<Uni32>(cur->to[i])); |
399 | 2.12M | } |
400 | 2.02M | } else { |
401 | 0 | cur->to[0] = 0; |
402 | 0 | cur->to[1] = T::to_non_char; |
403 | 0 | } |
404 | 2.02M | if (*p == ' ') ++p; |
405 | 2.02M | if (*p == '/') CREATE_NORM_TABLE(T, in, buf, cur->sub_table); |
406 | 2.02M | ++cur; |
407 | 2.02M | } |
408 | 39.7k | sanity(cur - d == size); |
409 | 39.7k | Tally * which = &tally0; |
410 | 39.7k | if (which->max > tally1.max) which = &tally1; |
411 | 39.7k | if (which->max > tally2.max) which = &tally2; |
412 | 39.7k | NormTable<T> * final = (NormTable<T> *)calloc(1, NormTable<T>::struct_size + |
413 | 39.7k | sizeof(T) * which->size * which->max); |
414 | 39.7k | memset(final, 0, NormTable<T>::struct_size + sizeof(T) * which->size * which->max); |
415 | 39.7k | final->mask = which->size - 1; |
416 | 39.7k | final->height = which->size; |
417 | 39.7k | final->width = which->max; |
418 | 39.7k | final->end = final->data + which->size * which->max; |
419 | 39.7k | final->size = size; |
420 | 2.06M | for (cur = d; cur != d + size; ++cur) { |
421 | 2.02M | T * dest = final->data + (cur->from & final->mask); |
422 | 2.11M | while (dest->from != 0) dest += final->height; |
423 | 2.02M | *dest = *cur; |
424 | 2.02M | if (dest->from == 0) dest->from = T::from_non_char; |
425 | 2.02M | } |
426 | 93.9k | for (T * dest = final->data; dest < final->end; dest += final->height) { |
427 | 54.1k | if (dest->from == 0 || (dest->from == T::from_non_char && dest->to[0] == 0)) { |
428 | 4.42k | dest->from = T::from_non_char; |
429 | 4.42k | dest->to[0] = T::to_non_char; |
430 | 4.42k | } |
431 | 54.1k | } |
432 | 39.7k | return final; |
433 | 39.7k | } convert.cpp:acommon::PosibErr<acommon::NormTable<acommon::FromUniNormEntry>*> acommon::create_norm_table<acommon::FromUniNormEntry>(acommon::IStream&, acommon::String&) Line | Count | Source | 365 | 37.5k | { | 366 | 37.5k | const char FUNC[] = "create_norm_table"; | 367 | 37.5k | const char * p = get_nb_line(in, buf); | 368 | 37.5k | sanity(*p == 'N'); | 369 | 37.5k | ++p; | 370 | 37.5k | int size = strtoul(p, (char **)&p, 10); | 371 | 37.5k | VARARRAY(T, d, size); | 372 | 37.5k | memset(d, 0, sizeof(T) * size); | 373 | 37.5k | int sz = 1 << (unsigned)floor(log(size <= 1 ? 1.0 : size - 1)/log(2.0)); | 374 | 37.5k | VARARRAY(int, tally0_d, sz); Tally tally0(sz, tally0_d); | 375 | 37.5k | VARARRAY(int, tally1_d, sz*2); Tally tally1(sz*2, tally1_d); | 376 | 37.5k | VARARRAY(int, tally2_d, sz*4); Tally tally2(sz*4, tally2_d); | 377 | 37.5k | T * cur = d; | 378 | 1.50M | while (p = get_nb_line(in, buf), *p != '.') { | 379 | 1.46M | Uni32 f = strtoul(p, (char **)&p, 16); | 380 | 1.46M | cur->from = static_cast<typename T::From>(f); | 381 | 1.46M | sanity(f == cur->from); | 382 | 1.46M | tally0.add(f); | 383 | 1.46M | tally1.add(f); | 384 | 1.46M | tally2.add(f); | 385 | 1.46M | ++p; | 386 | 1.46M | sanity(*p == '>'); | 387 | 1.46M | ++p; | 388 | 1.46M | sanity(*p == ' '); | 389 | 1.46M | ++p; | 390 | 1.46M | unsigned i = 0; | 391 | 1.46M | if (*p != '-') { | 392 | 2.96M | for (;; ++i) { | 393 | 2.96M | const char * q = p; | 394 | 2.96M | Uni32 t = strtoul(p, (char **)&p, 16); | 395 | 2.96M | if (q == p) break; | 396 | 1.50M | sanity(i < d->max_to); | 397 | 1.50M | cur->to[i] = static_cast<typename T::To>(t); | 398 | 1.50M | sanity(t == static_cast<Uni32>(cur->to[i])); | 399 | 1.50M | } | 400 | 1.46M | } else { | 401 | 0 | cur->to[0] = 0; | 402 | 0 | cur->to[1] = T::to_non_char; | 403 | 0 | } | 404 | 1.46M | if (*p == ' ') ++p; | 405 | 1.46M | if (*p == '/') CREATE_NORM_TABLE(T, in, buf, cur->sub_table); | 406 | 1.46M | ++cur; | 407 | 1.46M | } | 408 | 
37.5k | sanity(cur - d == size); | 409 | 37.5k | Tally * which = &tally0; | 410 | 37.5k | if (which->max > tally1.max) which = &tally1; | 411 | 37.5k | if (which->max > tally2.max) which = &tally2; | 412 | 37.5k | NormTable<T> * final = (NormTable<T> *)calloc(1, NormTable<T>::struct_size + | 413 | 37.5k | sizeof(T) * which->size * which->max); | 414 | 37.5k | memset(final, 0, NormTable<T>::struct_size + sizeof(T) * which->size * which->max); | 415 | 37.5k | final->mask = which->size - 1; | 416 | 37.5k | final->height = which->size; | 417 | 37.5k | final->width = which->max; | 418 | 37.5k | final->end = final->data + which->size * which->max; | 419 | 37.5k | final->size = size; | 420 | 1.50M | for (cur = d; cur != d + size; ++cur) { | 421 | 1.46M | T * dest = final->data + (cur->from & final->mask); | 422 | 1.54M | while (dest->from != 0) dest += final->height; | 423 | 1.46M | *dest = *cur; | 424 | 1.46M | if (dest->from == 0) dest->from = T::from_non_char; | 425 | 1.46M | } | 426 | 89.5k | for (T * dest = final->data; dest < final->end; dest += final->height) { | 427 | 51.9k | if (dest->from == 0 || (dest->from == T::from_non_char && dest->to[0] == 0)) { | 428 | 2.21k | dest->from = T::from_non_char; | 429 | 2.21k | dest->to[0] = T::to_non_char; | 430 | 2.21k | } | 431 | 51.9k | } | 432 | 37.5k | return final; | 433 | 37.5k | } |
convert.cpp:acommon::PosibErr<acommon::NormTable<acommon::ToUniNormEntry>*> acommon::create_norm_table<acommon::ToUniNormEntry>(acommon::IStream&, acommon::String&) Line | Count | Source | 365 | 2.21k | { | 366 | 2.21k | const char FUNC[] = "create_norm_table"; | 367 | 2.21k | const char * p = get_nb_line(in, buf); | 368 | 2.21k | sanity(*p == 'N'); | 369 | 2.21k | ++p; | 370 | 2.21k | int size = strtoul(p, (char **)&p, 10); | 371 | 2.21k | VARARRAY(T, d, size); | 372 | 2.21k | memset(d, 0, sizeof(T) * size); | 373 | 2.21k | int sz = 1 << (unsigned)floor(log(size <= 1 ? 1.0 : size - 1)/log(2.0)); | 374 | 2.21k | VARARRAY(int, tally0_d, sz); Tally tally0(sz, tally0_d); | 375 | 2.21k | VARARRAY(int, tally1_d, sz*2); Tally tally1(sz*2, tally1_d); | 376 | 2.21k | VARARRAY(int, tally2_d, sz*4); Tally tally2(sz*4, tally2_d); | 377 | 2.21k | T * cur = d; | 378 | 567k | while (p = get_nb_line(in, buf), *p != '.') { | 379 | 565k | Uni32 f = strtoul(p, (char **)&p, 16); | 380 | 565k | cur->from = static_cast<typename T::From>(f); | 381 | 565k | sanity(f == cur->from); | 382 | 565k | tally0.add(f); | 383 | 565k | tally1.add(f); | 384 | 565k | tally2.add(f); | 385 | 565k | ++p; | 386 | 565k | sanity(*p == '>'); | 387 | 565k | ++p; | 388 | 565k | sanity(*p == ' '); | 389 | 565k | ++p; | 390 | 565k | unsigned i = 0; | 391 | 565k | if (*p != '-') { | 392 | 1.19M | for (;; ++i) { | 393 | 1.19M | const char * q = p; | 394 | 1.19M | Uni32 t = strtoul(p, (char **)&p, 16); | 395 | 1.19M | if (q == p) break; | 396 | 624k | sanity(i < d->max_to); | 397 | 624k | cur->to[i] = static_cast<typename T::To>(t); | 398 | 624k | sanity(t == static_cast<Uni32>(cur->to[i])); | 399 | 624k | } | 400 | 565k | } else { | 401 | 0 | cur->to[0] = 0; | 402 | 0 | cur->to[1] = T::to_non_char; | 403 | 0 | } | 404 | 565k | if (*p == ' ') ++p; | 405 | 565k | if (*p == '/') CREATE_NORM_TABLE(T, in, buf, cur->sub_table); | 406 | 565k | ++cur; | 407 | 565k | } | 408 | 2.21k | sanity(cur - d == size); | 409 | 
2.21k | Tally * which = &tally0; | 410 | 2.21k | if (which->max > tally1.max) which = &tally1; | 411 | 2.21k | if (which->max > tally2.max) which = &tally2; | 412 | 2.21k | NormTable<T> * final = (NormTable<T> *)calloc(1, NormTable<T>::struct_size + | 413 | 2.21k | sizeof(T) * which->size * which->max); | 414 | 2.21k | memset(final, 0, NormTable<T>::struct_size + sizeof(T) * which->size * which->max); | 415 | 2.21k | final->mask = which->size - 1; | 416 | 2.21k | final->height = which->size; | 417 | 2.21k | final->width = which->max; | 418 | 2.21k | final->end = final->data + which->size * which->max; | 419 | 2.21k | final->size = size; | 420 | 567k | for (cur = d; cur != d + size; ++cur) { | 421 | 565k | T * dest = final->data + (cur->from & final->mask); | 422 | 565k | while (dest->from != 0) dest += final->height; | 423 | 565k | *dest = *cur; | 424 | 565k | if (dest->from == 0) dest->from = T::from_non_char; | 425 | 565k | } | 426 | 4.42k | for (T * dest = final->data; dest < final->end; dest += final->height) { | 427 | 2.21k | if (dest->from == 0 || (dest->from == T::from_non_char && dest->to[0] == 0)) { | 428 | 2.21k | dest->from = T::from_non_char; | 429 | 2.21k | dest->to[0] = T::to_non_char; | 430 | 2.21k | } | 431 | 2.21k | } | 432 | 2.21k | return final; | 433 | 2.21k | } |
|
434 | | |
435 | | static PosibErr<void> init_norm_tables(FStream & in, NormTables * d) |
436 | 1.10k | { |
437 | 1.10k | const char FUNC[] = "init_norm_tables"; |
438 | 1.10k | String l; |
439 | 1.10k | get_nb_line(in, l); |
440 | 1.10k | remove_comments(l); |
441 | 1.10k | sanity (l == "INTERNAL"); |
442 | 1.10k | get_nb_line(in, l); |
443 | 1.10k | remove_comments(l); |
444 | 1.10k | sanity (l == "/"); |
445 | 1.10k | CREATE_NORM_TABLE(FromUniNormEntry, in, l, d->internal); |
446 | 1.10k | get_nb_line(in, l); |
447 | 1.10k | remove_comments(l); |
448 | 1.10k | sanity (l == "STRICT"); |
449 | 1.10k | char * p = get_nb_line(in, l); |
450 | 1.10k | remove_comments(l); |
451 | 1.10k | if (l == "/") { |
452 | 1.10k | CREATE_NORM_TABLE(FromUniNormEntry, in, l, d->strict_d); |
453 | 1.10k | d->strict = d->strict_d; |
454 | 1.10k | } else { |
455 | 0 | sanity(*p == '='); |
456 | 0 | ++p; ++p; |
457 | 0 | sanity(strcmp(p, "INTERNAL") == 0); |
458 | 0 | d->strict = d->internal; |
459 | 0 | } |
460 | 4.42k | while (get_nb_line(in, l)) { |
461 | 3.31k | remove_comments(l); |
462 | 3.31k | d->to_uni.push_back(NormTables::ToUniTable()); |
463 | 3.31k | NormTables::ToUniTable & e = d->to_uni.back(); |
464 | 3.31k | e.name.resize(l.size()); |
465 | 14.3k | for (unsigned i = 0; i != l.size(); ++i) |
466 | 11.0k | e.name[i] = asc_tolower(l[i]); |
467 | 3.31k | char * p = get_nb_line(in, l); |
468 | 3.31k | remove_comments(l); |
469 | 3.31k | if (l == "/") { |
470 | 2.21k | CREATE_NORM_TABLE(ToUniNormEntry, in, l, e.data); |
471 | 2.21k | e.ptr = e.data; |
472 | 2.21k | } else { |
473 | 1.10k | sanity(*p == '='); |
474 | 1.10k | ++p; ++p; |
475 | 4.42k | for (char * q = p; *q; ++q) *q = asc_tolower(*q); |
476 | 1.10k | Vector<NormTables::ToUniTable>::iterator i = d->to_uni.begin(); |
477 | 2.21k | while (i->name != p && i != d->to_uni.end()) ++i; |
478 | 1.10k | sanity(i != d->to_uni.end()); |
479 | 1.10k | e.ptr = i->ptr; |
480 | 1.10k | get_nb_line(in, l); |
481 | 1.10k | } |
482 | 3.31k | } |
483 | 1.10k | return no_err; |
484 | 1.10k | } |
485 | | |
486 | | PosibErr<NormTables *> NormTables::get_new(const String & encoding, |
487 | | const Config * config) |
488 | 1.10k | { |
489 | 1.10k | String dir1,dir2,file_name; |
490 | 1.10k | fill_data_dir(config, dir1, dir2); |
491 | 1.10k | find_file(file_name,dir1,dir2,encoding,".cmap"); |
492 | | |
493 | 1.10k | FStream in; |
494 | 1.10k | PosibErrBase err = in.open(file_name, "r"); |
495 | 1.10k | if (err.get_err()) { |
496 | 0 | char mesg[300]; |
497 | 0 | snprintf(mesg, 300, _("This could also mean that the file \"%s\" could not be opened for reading or does not exist."), |
498 | 0 | file_name.c_str()); |
499 | 0 | return make_err(unknown_encoding, encoding, mesg); // FIXME |
500 | 0 | } |
501 | | |
502 | 1.10k | NormTables * d = new NormTables; |
503 | 1.10k | d->key = encoding; |
504 | 1.10k | err = init_norm_tables(in, d); |
505 | 1.10k | if (err.has_err()) { |
506 | 0 | return make_err(bad_file_format, file_name, err.get_err()->mesg); |
507 | 0 | } |
508 | | |
509 | 1.10k | return d; |
510 | | |
511 | 1.10k | } |
512 | | |
513 | | NormTables::~NormTables() |
514 | 1.10k | { |
515 | 1.10k | free_norm_table<FromUniNormEntry>(internal); |
516 | 1.10k | if (strict_d) |
517 | 1.10k | free_norm_table<FromUniNormEntry>(strict_d); |
518 | 4.42k | for (unsigned i = 0; i != to_uni.size(); ++i) { |
519 | 3.31k | if (to_uni[i].data) |
520 | 2.21k | free_norm_table<ToUniNormEntry>(to_uni[i].data); |
521 | 3.31k | } |
522 | 1.10k | } |
523 | | |
524 | | ////////////////////////////////////////////////////////////////////// |
525 | | ////////////////////////////////////////////////////////////////////// |
526 | | // |
527 | | // Convert |
528 | | // |
529 | | ////////////////////////////////////////////////////////////////////// |
530 | | ////////////////////////////////////////////////////////////////////// |
531 | | |
532 | | |
533 | | bool operator== (const Convert & rhs, const Convert & lhs) |
534 | 0 | { |
535 | 0 | return strcmp(rhs.in_code(), lhs.in_code()) == 0 |
536 | 0 | && strcmp(rhs.out_code(), lhs.out_code()) == 0; |
537 | 0 | } |
538 | | |
539 | | ////////////////////////////////////////////////////////////////////// |
540 | | // |
541 | | // Trivial Conversion |
542 | | // |
543 | | |
// Printed before aborting when a negative size that does not match the
// character width is passed to a decoder.
const char * unsupported_null_term_wide_string_msg
  = "Null-terminated wide-character strings unsupported when used this way.";
546 | | |
547 | | template <typename Chr> |
548 | | struct DecodeDirect : public Decode |
549 | | { |
550 | 1.23k | DecodeDirect() {type_width = sizeof(Chr);}acommon::DecodeDirect<unsigned char>::DecodeDirect() Line | Count | Source | 550 | 1.13k | DecodeDirect() {type_width = sizeof(Chr);} |
acommon::DecodeDirect<unsigned short>::DecodeDirect() Line | Count | Source | 550 | 34 | DecodeDirect() {type_width = sizeof(Chr);} |
acommon::DecodeDirect<unsigned int>::DecodeDirect() Line | Count | Source | 550 | 73 | DecodeDirect() {type_width = sizeof(Chr);} |
|
551 | 338k | void decode(const char * in0, int size, FilterCharVector & out) const { |
552 | 338k | const Chr * in = reinterpret_cast<const Chr *>(in0); |
553 | 338k | if (size == -sizeof(Chr)) { |
554 | 1.12M | for (;*in; ++in) |
555 | 805k | out.append(*in, sizeof(Chr)); |
556 | 317k | } else if (size <= -1) { |
557 | 0 | fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg); |
558 | 0 | abort(); |
559 | 20.9k | } else { |
560 | 20.9k | const Chr * stop = reinterpret_cast<const Chr *>(in0) + size/sizeof(Chr); |
561 | 46.7M | for (;in != stop; ++in) |
562 | 46.7M | out.append(*in, sizeof(Chr)); |
563 | 20.9k | } |
564 | 338k | } acommon::DecodeDirect<unsigned char>::decode(char const*, int, acommon::FilterCharVector&) const Line | Count | Source | 551 | 334k | void decode(const char * in0, int size, FilterCharVector & out) const { | 552 | 334k | const Chr * in = reinterpret_cast<const Chr *>(in0); | 553 | 334k | if (size == -sizeof(Chr)) { | 554 | 1.12M | for (;*in; ++in) | 555 | 805k | out.append(*in, sizeof(Chr)); | 556 | 317k | } else if (size <= -1) { | 557 | 0 | fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg); | 558 | 0 | abort(); | 559 | 16.9k | } else { | 560 | 16.9k | const Chr * stop = reinterpret_cast<const Chr *>(in0) + size/sizeof(Chr); | 561 | 46.3M | for (;in != stop; ++in) | 562 | 46.3M | out.append(*in, sizeof(Chr)); | 563 | 16.9k | } | 564 | 334k | } |
acommon::DecodeDirect<unsigned short>::decode(char const*, int, acommon::FilterCharVector&) const Line | Count | Source | 551 | 2.25k | void decode(const char * in0, int size, FilterCharVector & out) const { | 552 | 2.25k | const Chr * in = reinterpret_cast<const Chr *>(in0); | 553 | 2.25k | if (size == -sizeof(Chr)) { | 554 | 0 | for (;*in; ++in) | 555 | 0 | out.append(*in, sizeof(Chr)); | 556 | 2.25k | } else if (size <= -1) { | 557 | 0 | fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg); | 558 | 0 | abort(); | 559 | 2.25k | } else { | 560 | 2.25k | const Chr * stop = reinterpret_cast<const Chr *>(in0) + size/sizeof(Chr); | 561 | 309k | for (;in != stop; ++in) | 562 | 307k | out.append(*in, sizeof(Chr)); | 563 | 2.25k | } | 564 | 2.25k | } |
acommon::DecodeDirect<unsigned int>::decode(char const*, int, acommon::FilterCharVector&) const Line | Count | Source | 551 | 1.70k | void decode(const char * in0, int size, FilterCharVector & out) const { | 552 | 1.70k | const Chr * in = reinterpret_cast<const Chr *>(in0); | 553 | 1.70k | if (size == -sizeof(Chr)) { | 554 | 0 | for (;*in; ++in) | 555 | 0 | out.append(*in, sizeof(Chr)); | 556 | 1.70k | } else if (size <= -1) { | 557 | 0 | fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg); | 558 | 0 | abort(); | 559 | 1.70k | } else { | 560 | 1.70k | const Chr * stop = reinterpret_cast<const Chr *>(in0) + size/sizeof(Chr); | 561 | 142k | for (;in != stop; ++in) | 562 | 140k | out.append(*in, sizeof(Chr)); | 563 | 1.70k | } | 564 | 1.70k | } |
|
565 | | PosibErr<void> decode_ec(const char * in0, int size, |
566 | 0 | FilterCharVector & out, ParmStr) const { |
567 | 0 | DecodeDirect::decode(in0, size, out); |
568 | 0 | return no_err; |
569 | 0 | } Unexecuted instantiation: acommon::DecodeDirect<unsigned char>::decode_ec(char const*, int, acommon::FilterCharVector&, acommon::ParmString const&) const Unexecuted instantiation: acommon::DecodeDirect<unsigned short>::decode_ec(char const*, int, acommon::FilterCharVector&, acommon::ParmString const&) const Unexecuted instantiation: acommon::DecodeDirect<unsigned int>::decode_ec(char const*, int, acommon::FilterCharVector&, acommon::ParmString const&) const |
570 | | }; |
571 | | |
572 | | template <typename Chr> |
573 | | struct EncodeDirect : public Encode |
574 | | { |
575 | 1.27k | EncodeDirect() {type_width = sizeof(Chr);}acommon::EncodeDirect<unsigned char>::EncodeDirect() Line | Count | Source | 575 | 1.13k | EncodeDirect() {type_width = sizeof(Chr);} |
acommon::EncodeDirect<unsigned short>::EncodeDirect() Line | Count | Source | 575 | 34 | EncodeDirect() {type_width = sizeof(Chr);} |
acommon::EncodeDirect<unsigned int>::EncodeDirect() Line | Count | Source | 575 | 113 | EncodeDirect() {type_width = sizeof(Chr);} |
|
576 | | void encode(const FilterChar * in, const FilterChar * stop, |
577 | 1.28M | CharVector & out) const { |
578 | 5.53M | for (; in != stop; ++in) { |
579 | 4.25M | Chr c = in->chr; |
580 | 4.25M | if (c != in->chr) c = '?'; |
581 | 4.25M | out.append(&c, sizeof(Chr)); |
582 | 4.25M | } |
583 | 1.28M | } acommon::EncodeDirect<unsigned char>::encode(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&) const Line | Count | Source | 577 | 970k | CharVector & out) const { | 578 | 4.41M | for (; in != stop; ++in) { | 579 | 3.44M | Chr c = in->chr; | 580 | 3.44M | if (c != in->chr) c = '?'; | 581 | 3.44M | out.append(&c, sizeof(Chr)); | 582 | 3.44M | } | 583 | 970k | } |
acommon::EncodeDirect<unsigned short>::encode(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&) const Line | Count | Source | 577 | 211k | CharVector & out) const { | 578 | 838k | for (; in != stop; ++in) { | 579 | 627k | Chr c = in->chr; | 580 | 627k | if (c != in->chr) c = '?'; | 581 | 627k | out.append(&c, sizeof(Chr)); | 582 | 627k | } | 583 | 211k | } |
acommon::EncodeDirect<unsigned int>::encode(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&) const Line | Count | Source | 577 | 106k | CharVector & out) const { | 578 | 284k | for (; in != stop; ++in) { | 579 | 178k | Chr c = in->chr; | 580 | 178k | if (c != in->chr) c = '?'; | 581 | 178k | out.append(&c, sizeof(Chr)); | 582 | 178k | } | 583 | 106k | } |
|
584 | | PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop, |
585 | 3.73k | CharVector & out, ParmStr orig) const { |
586 | 12.5k | for (; in != stop; ++in) { |
587 | 8.84k | Chr c = in->chr; |
588 | 8.84k | if (c != in->chr) { |
589 | 0 | char m[70]; |
590 | 0 | snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr); |
591 | 0 | return make_err(invalid_string, orig, m); |
592 | 0 | } |
593 | | |
594 | 8.84k | out.append(&c, sizeof(Chr)); |
595 | 8.84k | } |
596 | 3.73k | return no_err; |
597 | 3.73k | } acommon::EncodeDirect<unsigned char>::encode_ec(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&, acommon::ParmString const&) const Line | Count | Source | 585 | 1.97k | CharVector & out, ParmStr orig) const { | 586 | 5.93k | for (; in != stop; ++in) { | 587 | 3.95k | Chr c = in->chr; | 588 | 3.95k | if (c != in->chr) { | 589 | 0 | char m[70]; | 590 | 0 | snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr); | 591 | 0 | return make_err(invalid_string, orig, m); | 592 | 0 | } | 593 | | | 594 | 3.95k | out.append(&c, sizeof(Chr)); | 595 | 3.95k | } | 596 | 1.97k | return no_err; | 597 | 1.97k | } |
Unexecuted instantiation: acommon::EncodeDirect<unsigned short>::encode_ec(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&, acommon::ParmString const&) const acommon::EncodeDirect<unsigned int>::encode_ec(acommon::FilterChar const*, acommon::FilterChar const*, acommon::String&, acommon::ParmString const&) const Line | Count | Source | 585 | 1.75k | CharVector & out, ParmStr orig) const { | 586 | 6.64k | for (; in != stop; ++in) { | 587 | 4.88k | Chr c = in->chr; | 588 | 4.88k | if (c != in->chr) { | 589 | 0 | char m[70]; | 590 | 0 | snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr); | 591 | 0 | return make_err(invalid_string, orig, m); | 592 | 0 | } | 593 | | | 594 | 4.88k | out.append(&c, sizeof(Chr)); | 595 | 4.88k | } | 596 | 1.75k | return no_err; | 597 | 1.75k | } |
|
598 | 84 | bool encode(FilterChar * &, FilterChar * &, FilterCharVector &) const { |
599 | 84 | return true; |
600 | 84 | } acommon::EncodeDirect<unsigned char>::encode(acommon::FilterChar*&, acommon::FilterChar*&, acommon::FilterCharVector&) const Line | Count | Source | 598 | 84 | bool encode(FilterChar * &, FilterChar * &, FilterCharVector &) const { | 599 | 84 | return true; | 600 | 84 | } |
Unexecuted instantiation: acommon::EncodeDirect<unsigned short>::encode(acommon::FilterChar*&, acommon::FilterChar*&, acommon::FilterCharVector&) const Unexecuted instantiation: acommon::EncodeDirect<unsigned int>::encode(acommon::FilterChar*&, acommon::FilterChar*&, acommon::FilterCharVector&) const |
601 | | }; |
602 | | |
603 | | template <typename Chr> |
604 | | struct ConvDirect : public DirectConv |
605 | | { |
606 | 30 | ConvDirect() {type_width = sizeof(Chr);}Unexecuted instantiation: acommon::ConvDirect<unsigned short>::ConvDirect() Unexecuted instantiation: acommon::ConvDirect<unsigned int>::ConvDirect() acommon::ConvDirect<char>::ConvDirect() Line | Count | Source | 606 | 30 | ConvDirect() {type_width = sizeof(Chr);} |
|
607 | 18.0k | void convert(const char * in0, int size, CharVector & out) const { |
608 | 18.0k | if (size == -sizeof(Chr)) { |
609 | 17.7k | const Chr * in = reinterpret_cast<const Chr *>(in0); |
610 | 70.3k | for (;*in != 0; ++in) |
611 | 52.5k | out.append(in, sizeof(Chr)); |
612 | 17.7k | } else if (size <= -1) { |
613 | 0 | fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg); |
614 | 0 | abort(); |
615 | 258 | } else { |
616 | 258 | out.append(in0, size); |
617 | 258 | } |
618 | 18.0k | } Unexecuted instantiation: acommon::ConvDirect<unsigned short>::convert(char const*, int, acommon::String&) const Unexecuted instantiation: acommon::ConvDirect<unsigned int>::convert(char const*, int, acommon::String&) const acommon::ConvDirect<char>::convert(char const*, int, acommon::String&) const Line | Count | Source | 607 | 18.0k | void convert(const char * in0, int size, CharVector & out) const { | 608 | 18.0k | if (size == -sizeof(Chr)) { | 609 | 17.7k | const Chr * in = reinterpret_cast<const Chr *>(in0); | 610 | 70.3k | for (;*in != 0; ++in) | 611 | 52.5k | out.append(in, sizeof(Chr)); | 612 | 17.7k | } else if (size <= -1) { | 613 | 0 | fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg); | 614 | 0 | abort(); | 615 | 258 | } else { | 616 | 258 | out.append(in0, size); | 617 | 258 | } | 618 | 18.0k | } |
|
619 | | PosibErr<void> convert_ec(const char * in0, int size, |
620 | 0 | CharVector & out, ParmStr) const { |
621 | 0 | ConvDirect::convert(in0, size, out); |
622 | 0 | return no_err; |
623 | 0 | } Unexecuted instantiation: acommon::ConvDirect<unsigned short>::convert_ec(char const*, int, acommon::String&, acommon::ParmString const&) const Unexecuted instantiation: acommon::ConvDirect<unsigned int>::convert_ec(char const*, int, acommon::String&, acommon::ParmString const&) const Unexecuted instantiation: acommon::ConvDirect<char>::convert_ec(char const*, int, acommon::String&, acommon::ParmString const&) const |
624 | | }; |
625 | | |
626 | | ////////////////////////////////////////////////////////////////////// |
627 | | // |
628 | | // Lookup Conversion |
629 | | // |
630 | | |
631 | | struct DecodeLookup : public Decode |
632 | | { |
633 | | ToUniLookup lookup; |
634 | 33 | PosibErr<void> init(ParmStr code, const Config & c) { |
635 | 33 | FromUniLookup unused; |
636 | 33 | return read_in_char_data(c, code, lookup, unused); |
637 | 33 | } |
638 | 812 | void decode(const char * in, int size, FilterCharVector & out) const { |
639 | 812 | if (size == -1) { |
640 | 0 | for (;*in; ++in) |
641 | 0 | out.append(lookup[*in]); |
642 | 812 | } else { |
643 | 812 | const char * stop = in + size; |
644 | 392k | for (;in != stop; ++in) |
645 | 391k | out.append(lookup[*in]); |
646 | 812 | } |
647 | 812 | } |
648 | | PosibErr<void> decode_ec(const char * in, int size, |
649 | 0 | FilterCharVector & out, ParmStr) const { |
650 | 0 | DecodeLookup::decode(in, size, out); |
651 | 0 | return no_err; |
652 | 0 | } |
653 | | }; |
654 | | |
655 | | struct DecodeNormLookup : public Decode |
656 | | { |
657 | | typedef ToUniNormEntry E; |
658 | | NormTable<E> * data; |
659 | 9.89k | DecodeNormLookup(NormTable<E> * d) : data(d) {} |
660 | | // must be null terminated |
661 | | // FIXME: Why must it be null terminated? |
662 | 980k | void decode(const char * in, int size, FilterCharVector & out) const { |
663 | 980k | const char * stop = in + size; // will work even if size -1 |
664 | 4.32M | while (in != stop) { |
665 | 4.32M | if (*in == 0) { |
666 | 980k | if (size == -1) break; |
667 | 0 | out.append(0); |
668 | 0 | ++in; |
669 | 3.34M | } else { |
670 | 3.34M | NormLookupRet<E,const char> ret = norm_lookup<E>(data, in, stop, 0, in); |
671 | 6.68M | for (unsigned i = 0; ret.to[i] && i < E::max_to; ++i) |
672 | 3.34M | out.append(ret.to[i]); |
673 | 3.34M | in = ret.last + 1; |
674 | 3.34M | } |
675 | 4.32M | } |
676 | 980k | } |
677 | | PosibErr<void> decode_ec(const char * in, int size, |
678 | 0 | FilterCharVector & out, ParmStr) const { |
679 | 0 | DecodeNormLookup::decode(in, size, out); |
680 | 0 | return no_err; |
681 | 0 | } |
682 | | }; |
683 | | |
684 | | struct EncodeLookup : public Encode |
685 | | { |
686 | | FromUniLookup lookup; |
687 | | PosibErr<void> init(ParmStr code, const Config & c) |
688 | 18 | {ToUniLookup unused; |
689 | 18 | return read_in_char_data(c, code, unused, lookup);} |
690 | | void encode(const FilterChar * in, const FilterChar * stop, |
691 | 14.2k | CharVector & out) const { |
692 | 67.2k | for (; in != stop; ++in) { |
693 | 53.0k | out.append(lookup(*in)); |
694 | 53.0k | } |
695 | 14.2k | } |
696 | | PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop, |
697 | 0 | CharVector & out, ParmStr orig) const { |
698 | 0 | for (; in != stop; ++in) { |
699 | 0 | char c = lookup(*in, '\0'); |
700 | 0 | if (c == '\0' && in->chr != 0) { |
701 | 0 | char m[70]; |
702 | 0 | snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr); |
703 | 0 | return make_err(invalid_string, orig, m); |
704 | 0 | } |
705 | 0 | out.append(c); |
706 | 0 | } |
707 | 0 | return no_err; |
708 | 0 | } |
709 | | bool encode(FilterChar * & in0, FilterChar * & stop, |
710 | 0 | FilterCharVector & out) const { |
711 | 0 | FilterChar * in = in0; |
712 | 0 | for (; in != stop; ++in) |
713 | 0 | *in = lookup(*in); |
714 | 0 | return true; |
715 | 0 | } |
716 | | }; |
717 | | |
718 | | struct EncodeNormLookup : public Encode |
719 | | { |
720 | | typedef FromUniNormEntry E; |
721 | | NormTable<E> * data; |
722 | 3.29k | EncodeNormLookup(NormTable<E> * d) : data(d) {} |
723 | | // *stop must equal 0 |
724 | | void encode(const FilterChar * in, const FilterChar * stop, |
725 | 16.5k | CharVector & out) const { |
726 | 16.4M | while (in < stop) { |
727 | 16.3M | if (*in == 0) { |
728 | 0 | out.append('\0'); |
729 | 0 | ++in; |
730 | 16.3M | } else { |
731 | 16.3M | NormLookupRet<E,const FilterChar> ret = norm_lookup<E>(data, in, stop, (const byte *)"?", in); |
732 | 32.7M | for (unsigned i = 0; i < E::max_to && ret.to[i]; ++i) |
733 | 16.3M | out.append(ret.to[i]); |
734 | 16.3M | in = ret.last + 1; |
735 | 16.3M | } |
736 | 16.3M | } |
737 | 16.5k | } |
738 | | PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop, |
739 | 24.9k | CharVector & out, ParmStr orig) const { |
740 | 74.9k | while (in < stop) { |
741 | 49.9k | if (*in == 0) { |
742 | 0 | out.append('\0'); |
743 | 0 | ++in; |
744 | 49.9k | } else { |
745 | 49.9k | NormLookupRet<E,const FilterChar> ret = norm_lookup<E>(data, in, stop, 0, in); |
746 | 49.9k | if (ret.to == 0) { |
747 | 0 | char m[70]; |
748 | 0 | snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr); |
749 | 0 | return make_err(invalid_string, orig, m); |
750 | 0 | } |
751 | 99.9k | for (unsigned i = 0; i < E::max_to && ret.to[i]; ++i) |
752 | 49.9k | out.append(ret.to[i]); |
753 | 49.9k | in = ret.last + 1; |
754 | 49.9k | } |
755 | 49.9k | } |
756 | 24.9k | return no_err; |
757 | 24.9k | } |
758 | | bool encode(FilterChar * & in, FilterChar * & stop, |
759 | 1.06k | FilterCharVector & buf) const { |
760 | 1.06k | buf.clear(); |
761 | 30.1M | while (in < stop) { |
762 | 30.1M | if (*in == 0) { |
763 | 3.49M | buf.append(FilterChar(0)); |
764 | 3.49M | ++in; |
765 | 26.7M | } else { |
766 | 26.7M | NormLookupRet<E,FilterChar> ret = norm_lookup<E>(data, in, stop, (const byte *)"?", in); |
767 | 26.7M | const FilterChar * end = ret.last + 1; |
768 | 26.7M | unsigned width = 0; |
769 | 53.4M | for (; in != end; ++in) width += in->width; |
770 | 26.7M | buf.append(FilterChar(ret.to[0], width)); |
771 | 26.7M | for (unsigned i = 1; i < E::max_to && ret.to[i]; ++i) { |
772 | 0 | buf.append(FilterChar(ret.to[i],0)); |
773 | 0 | } |
774 | 26.7M | } |
775 | 30.1M | } |
776 | 1.06k | buf.append(0); |
777 | 1.06k | in = buf.pbegin(); |
778 | 1.06k | stop = buf.pend(); |
779 | 1.06k | return true; |
780 | 1.06k | } |
781 | | }; |
782 | | |
783 | | ////////////////////////////////////////////////////////////////////// |
784 | | // |
785 | | // UTF8 |
786 | | // |
787 | | |
788 | | #define get_check_next \ |
789 | 1.03k | if (in == stop) goto error; \ |
790 | 1.03k | c = *in; \ |
791 | 1.03k | if ((c & 0xC0/*1100 0000*/) != 0x80/*10xx xxxx*/) goto error;\ |
792 | 1.03k | ++in; \ |
793 | 889 | u <<= 6; \ |
794 | 889 | u |= c & 0x3F/*0011 1111*/; \ |
795 | 889 | ++w; |
796 | | |
797 | | static inline FilterChar from_utf8 (const char * & in, const char * stop = 0, |
798 | | Uni32 err_char = '?') |
799 | 70.9k | { |
800 | 70.9k | Uni32 u = (Uni32)(-1); |
801 | 70.9k | FilterChar::Width w = 1; |
802 | | |
803 | | // the first char is guaranteed not to be off the end |
804 | 70.9k | char c = *in; |
805 | 70.9k | ++in; |
806 | | |
807 | 70.9k | if ((c & 0x80/*1000 0000*/) == 0x00/*0xxx xxx*/) { |
808 | 70.1k | u = c; |
809 | 70.1k | } else if ((c & 0xE0/*1110 0000*/) == 0xC0/*110x xxxx*/) { // 2-byte wide |
810 | 105 | u = c & 0x1F/*0001 1111*/; |
811 | 156 | get_check_next; |
812 | 699 | } else if ((c & 0xF0/*1111 0000*/) == 0xE0/*1110 xxxx*/) { // 3-byte wide |
813 | 254 | u = c & 0x0F/*0000 1111*/; |
814 | 501 | get_check_next; |
815 | 501 | get_check_next; |
816 | 488 | } else if ((c & 0xF8/*1111 1000*/) == 0xF0/*1111 0xxx*/) { // 4-byte wide |
817 | 193 | u = c & 0x07/*0000 0111*/; |
818 | 310 | get_check_next; |
819 | 310 | get_check_next; |
820 | 234 | get_check_next; |
821 | 252 | } else { |
822 | 252 | goto error; |
823 | 252 | } |
824 | | |
825 | 70.5k | return FilterChar(u, w); |
826 | 396 | error: |
827 | 396 | return FilterChar(err_char, w); |
828 | 70.9k | } |
829 | | |
830 | | static inline void to_utf8 (FilterChar in, CharVector & out) |
831 | 834 | { |
832 | 834 | FilterChar::Chr c = in; |
833 | | |
834 | 834 | if (c < 0x80) { |
835 | 834 | out.append(c); |
836 | 834 | } |
837 | 0 | else if (c < 0x800) { |
838 | 0 | out.append(0xC0 | (c>>6)); |
839 | 0 | out.append(0x80 | (c & 0x3F)); |
840 | 0 | } |
841 | 0 | else if (c < 0x10000) { |
842 | 0 | out.append(0xE0 | (c>>12)); |
843 | 0 | out.append(0x80 | (c>>6 & 0x3F)); |
844 | 0 | out.append(0x80 | (c & 0x3F)); |
845 | 0 | } |
846 | 0 | else if (c < 0x200000) { |
847 | 0 | out.append(0xF0 | (c>>18)); |
848 | 0 | out.append(0x80 | (c>>12 & 0x3F)); |
849 | 0 | out.append(0x80 | (c>>6 & 0x3F)); |
850 | 0 | out.append(0x80 | (c & 0x3F)); |
851 | 0 | } |
852 | 834 | } |
853 | | |
854 | | struct DecodeUtf8 : public Decode |
855 | | { |
856 | | ToUniLookup lookup; |
857 | 6 | void decode(const char * in, int size, FilterCharVector & out) const { |
858 | 6 | if (size == -1) { |
859 | 0 | while (*in) |
860 | 0 | out.append(from_utf8(in)); |
861 | 6 | } else { |
862 | 6 | const char * stop = in + size; |
863 | 12.1k | while (in != stop) |
864 | 12.1k | out.append(from_utf8(in, stop)); |
865 | 6 | } |
866 | 6 | } |
867 | | PosibErr<void> decode_ec(const char * in, int size, |
868 | 28.7k | FilterCharVector & out, ParmStr orig) const { |
869 | 28.7k | const char * begin = in; |
870 | 28.7k | if (size == -1) { |
871 | 6.67k | while (*in) { |
872 | 4.91k | FilterChar c = from_utf8(in, 0, (Uni32)-1); |
873 | 4.91k | if (c == (Uni32)-1) goto error; |
874 | 4.90k | out.append(c); |
875 | 4.90k | } |
876 | 26.9k | } else { |
877 | 26.9k | const char * stop = in + size; |
878 | 80.9k | while (in != stop) { |
879 | 53.9k | FilterChar c = from_utf8(in, stop, (Uni32)-1); |
880 | 53.9k | if (c == (Uni32)-1) goto error; |
881 | 53.9k | out.append(c); |
882 | 53.9k | } |
883 | 26.9k | } |
884 | 28.7k | return no_err; |
885 | 9 | error: |
886 | 9 | char m[70]; |
887 | 9 | snprintf(m, 70, _("Invalid UTF-8 sequence at position %ld."), (long)(in - begin)); |
888 | 9 | return make_err(invalid_string, orig, m); |
889 | 28.7k | } |
890 | | }; |
891 | | |
892 | | struct EncodeUtf8 : public Encode |
893 | | { |
894 | | FromUniLookup lookup; |
895 | | void encode(const FilterChar * in, const FilterChar * stop, |
896 | 220 | CharVector & out) const { |
897 | 1.05k | for (; in != stop; ++in) { |
898 | 834 | to_utf8(*in, out); |
899 | 834 | } |
900 | 220 | } |
901 | | PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop, |
902 | 0 | CharVector & out, ParmStr) const { |
903 | 0 | for (; in != stop; ++in) { |
904 | 0 | to_utf8(*in, out); |
905 | 0 | } |
906 | 0 | return no_err; |
907 | 0 | } |
908 | | }; |
909 | | |
910 | | ////////////////////////////////////////////////////////////////////// |
911 | | // |
912 | | // Cache |
913 | | // |
914 | | |
// File-local caches handed to setup() by the Convert::init* functions
// below: one for Decode objects, one for Encode objects and one for
// NormTables.  The string argument is the name given to each cache.
static GlobalCache<Decode> decode_cache("decode");
static GlobalCache<Encode> encode_cache("encode");
static GlobalCache<NormTables> norm_tables_cache("norm_tables");
918 | | |
919 | | ////////////////////////////////////////////////////////////////////// |
920 | | // |
921 | | // new_aspell_convert |
922 | | // |
923 | | |
924 | | void Convert::generic_convert(const char * in, int size, CharVector & out) |
925 | 4.66k | { |
926 | 4.66k | buf_.clear(); |
927 | 4.66k | decode_->decode(in, size, buf_); |
928 | 4.66k | FilterChar * start = buf_.pbegin(); |
929 | 4.66k | FilterChar * stop = buf_.pend(); |
930 | 4.66k | if (!filter.empty()) |
931 | 4.66k | filter.process(start, stop); |
932 | 4.66k | encode_->encode(start, stop, out); |
933 | 4.66k | } |
934 | | |
935 | | const char * fix_encoding_str(ParmStr enc, String & buf) |
936 | 51.3k | { |
937 | 51.3k | buf.clear(); |
938 | 51.3k | buf.reserve(enc.size() + 1); |
939 | 525k | for (size_t i = 0; i != enc.size(); ++i) |
940 | 473k | buf.push_back(asc_tolower(enc[i])); |
941 | | |
942 | 51.3k | if (strncmp(buf.c_str(), "iso8859", 7) == 0) |
943 | 3.51k | buf.insert(buf.begin() + 3, '-'); // For backwards compatibility |
944 | | |
945 | 51.3k | if (buf == "ascii" || buf == "ansi_x3.4-1968") |
946 | 0 | return "iso-8859-1"; |
947 | 51.3k | else if (buf == "machine unsigned 16" || buf == "utf-16") |
948 | 68 | return "ucs-2"; |
949 | 51.2k | else if (buf == "machine unsigned 32" || buf == "utf-32") |
950 | 146 | return "ucs-4"; |
951 | 51.1k | else |
952 | 51.1k | return buf.c_str(); |
953 | 51.3k | } |
954 | | |
955 | | bool ascii_encoding(const Config & c, ParmStr enc0) |
956 | 2.22k | { |
957 | 2.22k | if (enc0.empty()) return true; |
958 | 2.22k | if (enc0 == "ANSI_X3.4-1968" |
959 | 2.22k | || enc0 == "ASCII" || enc0 == "ascii") return true; |
960 | 0 | String buf; |
961 | 0 | const char * enc = fix_encoding_str(enc0, buf); |
962 | 0 | if (strcmp(enc, "utf-8") == 0 |
963 | 0 | || strcmp(enc, "ucs-2") == 0 |
964 | 0 | || strcmp(enc, "ucs-4") == 0) return false; |
965 | 0 | String dir1,dir2,file_name; |
966 | 0 | fill_data_dir(&c, dir1, dir2); |
967 | 0 | file_name << dir1 << enc << ".cset"; |
968 | 0 | if (file_exists(file_name)) return false; |
969 | 0 | if (dir1 == dir2) return true; |
970 | 0 | file_name.clear(); |
971 | 0 | file_name << dir2 << enc << ".cset"; |
972 | 0 | return !file_exists(file_name); |
973 | 0 | } |
974 | | |
975 | | PosibErr<Convert *> internal_new_convert(const Config & c, |
976 | | ConvKey in, |
977 | | ConvKey out, |
978 | | bool if_needed, |
979 | | Normalize norm) |
980 | 23.8k | { |
981 | 23.8k | String in_s; |
982 | 23.8k | in.val = fix_encoding_str(in.val, in_s); |
983 | | |
984 | 23.8k | String out_s; |
985 | 23.8k | out.val = fix_encoding_str(out.val, out_s); |
986 | | |
987 | 23.8k | if (if_needed && in.val == out.val) return 0; |
988 | | |
989 | 14.3k | StackPtr<Convert> conv(new Convert); |
990 | 14.3k | switch (norm) { |
991 | 45 | case NormNone: |
992 | 45 | RET_ON_ERR(conv->init(c, in, out)); break; |
993 | 3.57k | case NormFrom: |
994 | 3.57k | RET_ON_ERR(conv->init_norm_from(c, in, out)); break; |
995 | 10.7k | case NormTo: |
996 | 10.7k | RET_ON_ERR(conv->init_norm_to(c, in, out)); break; |
997 | 14.3k | } |
998 | 14.3k | return conv.release(); |
999 | 14.3k | } |
1000 | | |
1001 | | PosibErr<Decode *> Decode::get_new(const ConvKey & k, const Config * c) |
1002 | 2.46k | { |
1003 | 2.46k | StackPtr<Decode> ptr; |
1004 | 2.46k | if (k.val == "iso-8859-1") { |
1005 | 1.13k | ptr.reset(new DecodeDirect<Uni8>); |
1006 | 1.33k | } else if (k.val == "ucs-2") { |
1007 | 34 | if (k.allow_ucs) |
1008 | 34 | ptr.reset(new DecodeDirect<Uni16>); |
1009 | 0 | else |
1010 | 0 | return make_err(encoding_not_supported, k.val); |
1011 | 1.30k | } else if (k.val == "ucs-4") { |
1012 | 73 | if (k.allow_ucs) |
1013 | 73 | ptr.reset(new DecodeDirect<Uni32>); |
1014 | 0 | else |
1015 | 0 | return make_err(encoding_not_supported, k.val); |
1016 | 1.23k | } else if (k.val == "utf-8") { |
1017 | 1.19k | ptr.reset(new DecodeUtf8); |
1018 | 1.19k | } else { |
1019 | 33 | ptr.reset(new DecodeLookup); |
1020 | 33 | } |
1021 | 2.46k | RET_ON_ERR(ptr->init(k.val, *c)); |
1022 | 2.45k | ptr->key = k.val; |
1023 | 2.45k | return ptr.release(); |
1024 | 2.46k | } |
1025 | | |
1026 | | PosibErr<Encode *> Encode::get_new(const ConvKey & k, const Config * c) |
1027 | 2.49k | { |
1028 | 2.49k | StackPtr<Encode> ptr; |
1029 | 2.49k | if (k.val == "iso-8859-1") { |
1030 | 1.13k | ptr.reset(new EncodeDirect<Uni8>); |
1031 | 1.36k | } else if (k.val == "ucs-2" && k.allow_ucs) { |
1032 | 34 | if (k.allow_ucs) |
1033 | 34 | ptr.reset(new EncodeDirect<Uni16>); |
1034 | 0 | else |
1035 | 0 | return make_err(encoding_not_supported, k.val); |
1036 | 1.32k | } else if (k.val == "ucs-4" && k.allow_ucs) { |
1037 | 113 | if (k.allow_ucs) |
1038 | 113 | ptr.reset(new EncodeDirect<Uni32>); |
1039 | 0 | else |
1040 | 0 | return make_err(encoding_not_supported, k.val); |
1041 | 1.21k | } else if (k.val == "utf-8") { |
1042 | 1.19k | ptr.reset(new EncodeUtf8); |
1043 | 1.19k | } else { |
1044 | 18 | ptr.reset(new EncodeLookup); |
1045 | 18 | } |
1046 | 2.49k | RET_ON_ERR(ptr->init(k.val, *c)); |
1047 | 2.49k | ptr->key = k.val; |
1048 | 2.49k | return ptr.release(); |
1049 | 2.49k | } |
1050 | | |
// Out-of-line destructor with an intentionally empty body.
Convert::~Convert() {}
1052 | | |
1053 | | PosibErr<void> Convert::init(const Config & c, const ConvKey & in, const ConvKey & out) |
1054 | 1.12k | { |
1055 | 1.12k | RET_ON_ERR(setup(decode_c, &decode_cache, &c, in)); |
1056 | 1.11k | decode_ = decode_c.get(); |
1057 | 1.11k | RET_ON_ERR(setup(encode_c, &encode_cache, &c, out)); |
1058 | 1.11k | encode_ = encode_c.get(); |
1059 | | |
1060 | 1.11k | conv_ = 0; |
1061 | 1.11k | if (in.val == out.val) { |
1062 | 30 | if (in.val == "ucs-2") { |
1063 | 0 | if (in.allow_ucs) { |
1064 | 0 | conv_ = new ConvDirect<Uni16>; |
1065 | 0 | } else { |
1066 | 0 | return make_err(encoding_not_supported, in.val); |
1067 | 0 | } |
1068 | 30 | } else if (in.val == "ucs-4") { |
1069 | 0 | if (in.allow_ucs) { |
1070 | 0 | conv_ = new ConvDirect<Uni32>; |
1071 | 0 | } else { |
1072 | 0 | return make_err(encoding_not_supported, in.val); |
1073 | 0 | } |
1074 | 30 | } else { |
1075 | 30 | conv_ = new ConvDirect<char>; |
1076 | 30 | } |
1077 | 30 | } |
1078 | | |
1079 | 1.11k | if (conv_) |
1080 | 30 | RET_ON_ERR(conv_->init(decode_, encode_, c)); |
1081 | | |
1082 | 1.11k | return no_err; |
1083 | 1.11k | } |
1084 | | |
1085 | | |
1086 | | PosibErr<void> Convert::init_norm_from(const Config & c, const ConvKey & in, const ConvKey & out) |
1087 | 3.57k | { |
1088 | 3.57k | if (!c.retrieve_bool("normalize") && !c.retrieve_bool("norm-required")) |
1089 | 272 | return init(c,in,out); |
1090 | | |
1091 | 3.30k | RET_ON_ERR(setup(norm_tables_, &norm_tables_cache, &c, out.val)); |
1092 | | |
1093 | 3.30k | RET_ON_ERR(setup(decode_c, &decode_cache, &c, in)); |
1094 | 3.29k | decode_ = decode_c.get(); |
1095 | | |
1096 | 3.29k | if (c.retrieve_bool("norm-strict")) { |
1097 | 12 | encode_s = new EncodeNormLookup(norm_tables_->strict); |
1098 | 12 | encode_ = encode_s; |
1099 | 12 | encode_->key = out.val; |
1100 | 12 | encode_->key += ":strict"; |
1101 | 3.28k | } else { |
1102 | 3.28k | encode_s = new EncodeNormLookup(norm_tables_->internal); |
1103 | 3.28k | encode_ = encode_s; |
1104 | 3.28k | encode_->key = out.val; |
1105 | 3.28k | encode_->key += ":internal"; |
1106 | 3.28k | } |
1107 | 3.29k | conv_ = 0; |
1108 | | |
1109 | 3.29k | return no_err; |
1110 | 3.30k | } |
1111 | | |
1112 | | PosibErr<void> Convert::init_norm_to(const Config & c, const ConvKey & in, const ConvKey & out) |
1113 | 10.7k | { |
1114 | 10.7k | String norm_form = c.retrieve("norm-form"); |
1115 | 10.7k | if ((!c.retrieve_bool("normalize") || norm_form == "none") |
1116 | 808 | && !c.retrieve_bool("norm-required")) |
1117 | 808 | return init(c,in,out); |
1118 | 9.89k | if (norm_form == "none" && c.retrieve_bool("norm-required")) |
1119 | 0 | norm_form = "nfc"; |
1120 | | |
1121 | 9.89k | RET_ON_ERR(setup(norm_tables_, &norm_tables_cache, &c, in.val)); |
1122 | | |
1123 | 9.89k | RET_ON_ERR(setup(encode_c, &encode_cache, &c, out)); |
1124 | 9.89k | encode_ = encode_c.get(); |
1125 | | |
1126 | 9.89k | NormTables::ToUni::const_iterator i = norm_tables_->to_uni.begin(); |
1127 | 19.7k | for (; i != norm_tables_->to_uni.end() && i->name != norm_form; ++i); |
1128 | 9.89k | if (i == norm_tables_->to_uni.end()) |
1129 | 0 | return make_err(aerror_bad_value, "norm-form", norm_form, "one of none, nfd, nfc, or comp"); |
1130 | | |
1131 | 9.89k | decode_s = new DecodeNormLookup(i->ptr); |
1132 | 9.89k | decode_ = decode_s; |
1133 | 9.89k | decode_->key = in.val; |
1134 | 9.89k | decode_->key += ':'; |
1135 | 9.89k | decode_->key += i->name; |
1136 | | |
1137 | 9.89k | conv_ = 0; |
1138 | | |
1139 | 9.89k | return no_err; |
1140 | 9.89k | } |
1141 | | |
1142 | | PosibErr<void> MBLen::setup(const Config &, ParmStr enc0) |
1143 | 0 | { |
1144 | 0 | String buf; |
1145 | 0 | const char * enc = fix_encoding_str(enc0,buf); |
1146 | 0 | if (strcmp(enc, "utf-8") == 0) encoding = UTF8; |
1147 | 0 | else if (strcmp(enc, "ucs-2") == 0) encoding = UCS2; |
1148 | 0 | else if (strcmp(enc, "ucs-4") == 0) encoding = UCS4; |
1149 | 0 | else encoding = Other; |
1150 | 0 | return no_err; |
1151 | 0 | } |
1152 | | |
1153 | | unsigned MBLen::operator()(const char * str, const char * stop) |
1154 | 0 | { |
1155 | 0 | unsigned size = 0; |
1156 | 0 | switch (encoding) { |
1157 | 0 | case Other: |
1158 | 0 | return stop - str; |
1159 | 0 | case UTF8: |
1160 | 0 | for (; str != stop; ++str) { |
1161 | 0 | if ((*str & 0x80) == 0 || (*str & 0xC0) == 0xC0) ++size; |
1162 | 0 | } |
1163 | 0 | return size; |
1164 | 0 | case UCS2: |
1165 | 0 | return (stop - str)/2; |
1166 | 0 | case UCS4: |
1167 | 0 | return (stop - str)/4; |
1168 | 0 | } |
1169 | 0 | return 0; |
1170 | 0 | } |
1171 | | |
1172 | 0 | PosibErr<void> unsupported_null_term_wide_string_err_(const char * func) { |
1173 | 0 | static bool reported_to_stderr = false; |
1174 | 0 | PosibErr<void> err = make_err(other_error, unsupported_null_term_wide_string_msg); |
1175 | 0 | if (!reported_to_stderr) { |
1176 | 0 | CERR.printf("ERROR: %s: %s\n", func, unsupported_null_term_wide_string_msg); |
1177 | 0 | reported_to_stderr = true; |
1178 | 0 | } |
1179 | 0 | return err; |
1180 | 0 | } |
1181 | | |
1182 | 0 | void unsupported_null_term_wide_string_abort_(const char * func) { |
1183 | 0 | CERR.printf("%s: %s\n", func, unsupported_null_term_wide_string_msg); |
1184 | 0 | abort(); |
1185 | 0 | } |
1186 | | |
1187 | | } |