/src/mdbtools/src/libmdb/iconv.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* MDB Tools - A library for reading MS Access database files |
2 | | * Copyright (C) 2000 Brian Bruns |
3 | | * |
4 | | * This library is free software; you can redistribute it and/or |
5 | | * modify it under the terms of the GNU Library General Public |
6 | | * License as published by the Free Software Foundation; either |
7 | | * version 2 of the License, or (at your option) any later version. |
8 | | * |
9 | | * This library is distributed in the hope that it will be useful, |
10 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | | * Library General Public License for more details. |
13 | | * |
14 | | * You should have received a copy of the GNU Library General Public |
15 | | * License along with this library; if not, write to the Free Software |
16 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
17 | | */ |
18 | | |
19 | | #include <errno.h> |
20 | | #include "mdbtools.h" |
21 | | |
#ifndef MIN
/*
 * Smaller of two values.
 * Every argument and the whole expansion are parenthesized so that
 * expression arguments (e.g. `x ? y : z`, `a & b`) keep their meaning.
 * Note: the selected argument is still evaluated twice, so avoid passing
 * expressions with side effects.
 */
#define MIN(a,b) (((a) > (b)) ? (b) : (a))
#endif
25 | | |
/*
 * Expand a 'Unicode Compressed' byte stream into two-byte code units.
 *
 * The stream starts in compressed mode. A zero byte toggles between modes
 * and produces no output. In compressed mode each input byte becomes that
 * byte followed by a zero byte; in uncompressed mode two input bytes are
 * copied through unchanged. A trailing lone byte in uncompressed mode is
 * dropped.
 *
 * src/slen: input bytes (after any leading marker has been skipped).
 * dst/dlen: output buffer and its capacity in bytes.
 * Returns the number of bytes written to dst (never more than dlen).
 */
static size_t decompress_unicode(const char *src, size_t slen, char *dst, size_t dlen) {
	unsigned int compress = 1;
	size_t tlen = 0;

	/* Each emitting step writes two bytes, so stop as soon as fewer than
	 * two remain; this keeps an odd dlen from overrunning dst. */
	while (slen > 0 && dlen - tlen >= 2) {
		if (*src == 0) {
			/* mode switch marker */
			compress = (compress) ? 0 : 1;
			src++;
			slen--;
		} else if (compress) {
			dst[tlen++] = *src++;
			dst[tlen++] = 0;
			slen--;
		} else if (slen >= 2) {
			dst[tlen++] = *src++;
			dst[tlen++] = *src++;
			slen -= 2;
		} else { /* Odd # of bytes */
			break;
		}
	}
	return tlen;
}
48 | | |
49 | | #ifdef HAVE_ICONV |
50 | 408 | static size_t decompressed_to_utf8_with_iconv(MdbHandle *mdb, const char *in_ptr, size_t len_in, char *dest, size_t dlen) { |
51 | 408 | char *out_ptr = dest; |
52 | 408 | size_t len_out = dlen - 1; |
53 | | |
54 | 416 | while (len_out) { |
55 | 416 | iconv(mdb->iconv_in, (ICONV_CONST char **)&in_ptr, &len_in, &out_ptr, &len_out); |
56 | | /* |
57 | | * Have seen database with odd number of bytes in UCS-2, shouldn't happen but protect against it |
58 | | */ |
59 | 416 | if (!IS_JET3(mdb) && len_in<=1) { |
60 | | //fprintf(stderr, "Detected invalid number of UCS-2 bytes\n"); |
61 | 0 | break; |
62 | 0 | } |
63 | 416 | if (!len_in || !len_out || errno == E2BIG) break; |
64 | | /* Don't bail if impossible conversion is encountered */ |
65 | 8 | in_ptr += (IS_JET3(mdb)) ? 1 : 2; |
66 | 8 | len_in -= (IS_JET3(mdb)) ? 1 : 2; |
67 | 8 | *out_ptr++ = '?'; |
68 | 8 | len_out--; |
69 | 8 | } |
70 | 408 | dlen -= len_out + 1; |
71 | 408 | dest[dlen] = '\0'; |
72 | 408 | return dlen; |
73 | 408 | } |
74 | | #else |
/*
 * Convert single-byte input to UTF-8, treating every byte as the code point
 * of the same value: bytes below 0x80 are copied, bytes with the high bit set
 * become the two-byte sequence 0xC0|(c>>6), 0x80|(c&0x3F).
 *
 * Conversion stops before any character that would not fit completely,
 * always leaving room for the terminating NUL. With dlen == 0 nothing is
 * written at all.
 *
 * Returns the number of bytes stored in dest, not counting the terminator.
 */
static size_t latin1_to_utf8_without_iconv(const char *in_ptr, size_t len_in, char *dest, size_t dlen) {
	size_t used = 0;
	size_t i;

	if (dlen == 0)
		return 0;

	for (i = 0; i < len_in; i++) {
		unsigned char c = (unsigned char)in_ptr[i];
		size_t need = (c & 0x80) ? 2 : 1;

		/* Index arithmetic (rather than pointer comparison) so that a
		 * tiny buffer never forms a pointer before the start of dest. */
		if (need > dlen - 1 - used)
			break;

		if (c & 0x80) {
			dest[used++] = (char)(0xC0 | (c >> 6));
			dest[used++] = (char)(0x80 | (c & 0x3F));
		} else {
			dest[used++] = (char)c;
		}
	}
	dest[used] = '\0';
	return used;
}
90 | | |
91 | | static size_t unicode2ascii_locale(mdb_locale_t locale, const char *in_ptr, size_t len_in, char *dest, size_t dlen) { |
92 | | size_t i; |
93 | | size_t count = 0; |
94 | | size_t len_out = dlen - 1; |
95 | | wchar_t *w = malloc((len_in/2+1)*sizeof(wchar_t)); |
96 | | |
97 | | for(i=0; i<len_in/2; i++) |
98 | | { |
99 | | w[i] = (unsigned char)in_ptr[2*i] + ((unsigned char)in_ptr[2*i+1] << 8); |
100 | | } |
101 | | w[len_in/2] = '\0'; |
102 | | |
103 | | #if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64) || defined(WINDOWS) |
104 | | count = _wcstombs_l(dest, w, len_out, locale); |
105 | | #elif defined(HAVE_WCSTOMBS_L) |
106 | | count = wcstombs_l(dest, w, len_out, locale); |
107 | | #else |
108 | | locale_t oldlocale = uselocale(locale); |
109 | | count = wcstombs(dest, w, len_out); |
110 | | uselocale(oldlocale); |
111 | | #endif |
112 | | free(w); |
113 | | if (count == (size_t)-1) |
114 | | return 0; |
115 | | |
116 | | dest[count] = '\0'; |
117 | | return count; |
118 | | } |
119 | | |
120 | | static size_t decompressed_to_utf8_without_iconv(MdbHandle *mdb, const char *in_ptr, size_t len_in, char *dest, size_t dlen) { |
121 | | if (IS_JET3(mdb)) { |
122 | | if (mdb->f->code_page == 1252) { |
123 | | return latin1_to_utf8_without_iconv(in_ptr, len_in, dest, dlen); |
124 | | } |
125 | | int count = 0; |
126 | | snprintf(dest, dlen, "%.*s%n", (int)len_in, in_ptr, &count); |
127 | | return count; |
128 | | } |
129 | | return unicode2ascii_locale(mdb->locale, in_ptr, len_in, dest, dlen); |
130 | | } |
131 | | #endif |
132 | | |
133 | | /* |
134 | | * This function is used in reading text data from an MDB table. |
135 | | * 'dest' will receive a converted, null-terminated string. |
136 | | * dlen is the available size of the destination buffer. |
137 | | * Returns the length of the converted string, not including the terminator. |
138 | | */ |
139 | | int |
140 | | mdb_unicode2ascii(MdbHandle *mdb, const char *src, size_t slen, char *dest, size_t dlen) |
141 | 408 | { |
142 | 408 | char *tmp = NULL; |
143 | 408 | size_t len_in; |
144 | 408 | const char *in_ptr = NULL; |
145 | | |
146 | 408 | if ((!src) || (!dest) || (!dlen)) |
147 | 0 | return 0; |
148 | | |
149 | | /* Uncompress 'Unicode Compressed' string into tmp */ |
150 | 408 | if (!IS_JET3(mdb) && (slen>=2) |
151 | 408 | && ((src[0]&0xff)==0xff) && ((src[1]&0xff)==0xfe)) { |
152 | 0 | tmp = g_malloc(slen*2); |
153 | 0 | len_in = decompress_unicode(src + 2, slen - 2, tmp, slen * 2); |
154 | 0 | in_ptr = tmp; |
155 | 408 | } else { |
156 | 408 | len_in = slen; |
157 | 408 | in_ptr = src; |
158 | 408 | } |
159 | | |
160 | 408 | #ifdef HAVE_ICONV |
161 | 408 | dlen = decompressed_to_utf8_with_iconv(mdb, in_ptr, len_in, dest, dlen); |
162 | | #else |
163 | | dlen = decompressed_to_utf8_without_iconv(mdb, in_ptr, len_in, dest, dlen); |
164 | | #endif |
165 | | |
166 | 408 | if (tmp) g_free(tmp); |
167 | 408 | return dlen; |
168 | 408 | } |
169 | | |
170 | | /* |
171 | | * This function is used in writing text data to an MDB table. |
172 | | * If slen is 0, strlen will be used to calculate src's length. |
173 | | */ |
174 | | int |
175 | | mdb_ascii2unicode(MdbHandle *mdb, const char *src, size_t slen, char *dest, size_t dlen) |
176 | 0 | { |
177 | 0 | size_t len_in, len_out; |
178 | 0 | const char *in_ptr = NULL; |
179 | 0 | char *out_ptr = NULL; |
180 | |
|
181 | 0 | if ((!src) || (!dest) || (!dlen)) |
182 | 0 | return 0; |
183 | | |
184 | 0 | in_ptr = src; |
185 | 0 | out_ptr = dest; |
186 | 0 | len_in = (slen) ? slen : strlen(in_ptr); |
187 | 0 | len_out = dlen; |
188 | |
|
189 | 0 | #ifdef HAVE_ICONV |
190 | 0 | iconv(mdb->iconv_out, (ICONV_CONST char **)&in_ptr, &len_in, &out_ptr, &len_out); |
191 | | //printf("len_in %d len_out %d\n", len_in, len_out); |
192 | 0 | dlen -= len_out; |
193 | | #else |
194 | | if (IS_JET3(mdb)) { |
195 | | int count; |
196 | | snprintf(out_ptr, len_out, "%.*s%n", (int)len_in, in_ptr, &count); |
197 | | dlen = count; |
198 | | } else { |
199 | | unsigned int i; |
200 | | slen = MIN(len_in, len_out/2); |
201 | | dlen = slen*2; |
202 | | for (i=0; i<slen; i++) { |
203 | | out_ptr[i*2] = in_ptr[i]; |
204 | | out_ptr[i*2+1] = 0; |
205 | | } |
206 | | } |
207 | | #endif |
208 | | |
209 | | /* Unicode Compression */ |
210 | 0 | if(!IS_JET3(mdb) && (dlen>4)) { |
211 | 0 | unsigned char *tmp = g_malloc(dlen); |
212 | 0 | unsigned int tptr = 0, dptr = 0; |
213 | 0 | int comp = 1; |
214 | |
|
215 | 0 | tmp[tptr++] = 0xff; |
216 | 0 | tmp[tptr++] = 0xfe; |
217 | 0 | while((dptr < dlen) && (tptr < dlen)) { |
218 | 0 | if (((dest[dptr+1]==0) && (comp==0)) |
219 | 0 | || ((dest[dptr+1]!=0) && (comp==1))) { |
220 | | /* switch encoding mode */ |
221 | 0 | tmp[tptr++] = 0; |
222 | 0 | comp = (comp) ? 0 : 1; |
223 | 0 | } else if (dest[dptr]==0) { |
224 | | /* this string cannot be compressed */ |
225 | 0 | tptr = dlen; |
226 | 0 | } else if (comp==1) { |
227 | | /* encode compressed character */ |
228 | 0 | tmp[tptr++] = dest[dptr]; |
229 | 0 | dptr += 2; |
230 | 0 | } else if (tptr+1 < dlen) { |
231 | | /* encode uncompressed character */ |
232 | 0 | tmp[tptr++] = dest[dptr]; |
233 | 0 | tmp[tptr++] = dest[dptr+1]; |
234 | 0 | dptr += 2; |
235 | 0 | } else { |
236 | | /* could not encode uncompressed character |
237 | | * into single byte */ |
238 | 0 | tptr = dlen; |
239 | 0 | } |
240 | 0 | } |
241 | 0 | if (tptr < dlen) { |
242 | 0 | memcpy(dest, tmp, tptr); |
243 | 0 | dlen = tptr; |
244 | 0 | } |
245 | 0 | g_free(tmp); |
246 | 0 | } |
247 | |
|
248 | 0 | return dlen; |
249 | 0 | } |
250 | | |
251 | | const char* |
252 | | mdb_target_charset(MdbHandle *mdb) |
253 | 0 | { |
254 | 0 | #ifdef HAVE_ICONV |
255 | 0 | const char *iconv_code = getenv("MDBICONV"); |
256 | 0 | if (!iconv_code) |
257 | 0 | iconv_code = "UTF-8"; |
258 | 0 | return iconv_code; |
259 | | #else |
260 | | if (!IS_JET3(mdb)) |
261 | | return "ISO-8859-1"; |
262 | | return NULL; // same as input: unknown |
263 | | #endif |
264 | 0 | } |
265 | | |
266 | | /* See: https://docs.microsoft.com/en-us/windows/win32/Intl/code-page-identifiers */ |
267 | | #ifdef HAVE_ICONV |
/*
 * Map a Windows code page identifier to the corresponding iconv charset name.
 * Returns NULL when the code page is not in the table.
 */
static const char *mdb_iconv_name_from_code_page(int code_page) {
	const char *jet3_iconv_code = NULL;
	switch (code_page) {
		case 437: jet3_iconv_code="IBM437"; break;
		case 850: jet3_iconv_code="IBM850"; break;
		case 852: jet3_iconv_code="IBM852"; break;
		case 855: jet3_iconv_code="IBM855"; break;
		case 860: jet3_iconv_code="IBM860"; break;
		case 861: jet3_iconv_code="IBM861"; break;
		case 862: jet3_iconv_code="IBM862"; break;
		case 865: jet3_iconv_code="IBM865"; break;
		case 866: jet3_iconv_code="IBM866"; break;
		case 869: jet3_iconv_code="IBM869"; break;
		case 874: jet3_iconv_code="WINDOWS-874"; break;
		case 932: jet3_iconv_code="SHIFT-JIS"; break;
		case 936: jet3_iconv_code="WINDOWS-936"; break;
		case 950: jet3_iconv_code="BIG-5"; break;
		case 951: jet3_iconv_code="BIG5-HKSCS"; break;
		case 1200: jet3_iconv_code="UTF-16LE"; break;
		case 1201: jet3_iconv_code="UTF-16BE"; break;
		case 1250: jet3_iconv_code="WINDOWS-1250"; break;
		case 1251: jet3_iconv_code="WINDOWS-1251"; break;
		case 1252: jet3_iconv_code="WINDOWS-1252"; break;
		case 1253: jet3_iconv_code="WINDOWS-1253"; break;
		case 1254: jet3_iconv_code="WINDOWS-1254"; break;
		case 1255: jet3_iconv_code="WINDOWS-1255"; break;
		case 1256: jet3_iconv_code="WINDOWS-1256"; break;
		case 1257: jet3_iconv_code="WINDOWS-1257"; break;
		case 1258: jet3_iconv_code="WINDOWS-1258"; break;
		case 1361: jet3_iconv_code="CP1361"; break;
		case 12000: jet3_iconv_code="UTF-32LE"; break;
		case 12001: jet3_iconv_code="UTF-32BE"; break;
		case 20866: jet3_iconv_code="KOI8-R"; break;
		case 20932: jet3_iconv_code="EUC-JP"; break;
		case 21866: jet3_iconv_code="KOI8-U"; break;
		case 28591: jet3_iconv_code="ISO-8859-1"; break;
		case 28592: jet3_iconv_code="ISO-8859-2"; break;
		case 28593: jet3_iconv_code="ISO-8859-3"; break;
		case 28594: jet3_iconv_code="ISO-8859-4"; break;
		case 28595: jet3_iconv_code="ISO-8859-5"; break;
		case 28596: jet3_iconv_code="ISO-8859-6"; break;
		case 28597: jet3_iconv_code="ISO-8859-7"; break;
		case 28598: jet3_iconv_code="ISO-8859-8"; break;
		case 28599: jet3_iconv_code="ISO-8859-9"; break;
		/* The Windows identifiers for ISO-8859-13/-15 are 28603/28605.
		 * 28503/28505 were earlier typos for them and are kept only so
		 * that existing behaviour does not change. */
		case 28503: /* fallthrough */
		case 28603: jet3_iconv_code="ISO-8859-13"; break;
		case 28505: /* fallthrough */
		case 28605: jet3_iconv_code="ISO-8859-15"; break;
		case 51932: jet3_iconv_code="EUC-JP"; break;
		case 51936: jet3_iconv_code="EUC-CN"; break;
		case 51949: jet3_iconv_code="EUC-KR"; break;
		case 65000: jet3_iconv_code="UTF-7"; break;
		case 65001: jet3_iconv_code="UTF-8"; break;
		default: break;
	}
	return jet3_iconv_code;
}
323 | | #endif |
324 | | |
325 | | void mdb_iconv_init(MdbHandle *mdb) |
326 | 17 | { |
327 | 17 | const char *iconv_code; |
328 | | |
329 | | /* check environment variable */ |
330 | 17 | if (!(iconv_code=getenv("MDBICONV"))) { |
331 | 17 | iconv_code="UTF-8"; |
332 | 17 | } |
333 | | |
334 | 17 | #ifdef HAVE_ICONV |
335 | 17 | if (!IS_JET3(mdb)) { |
336 | 0 | mdb->iconv_out = iconv_open("UCS-2LE", iconv_code); |
337 | 0 | mdb->iconv_in = iconv_open(iconv_code, "UCS-2LE"); |
338 | 17 | } else { |
339 | | /* check environment variable */ |
340 | 17 | const char *jet3_iconv_code = getenv("MDB_JET3_CHARSET"); |
341 | | |
342 | 17 | if (!jet3_iconv_code) { |
343 | | /* Use code page embedded in the database */ |
344 | | /* Note that individual columns can override this value, |
345 | | * but per-column code pages are not supported by libmdb */ |
346 | 17 | jet3_iconv_code = mdb_iconv_name_from_code_page(mdb->f->code_page); |
347 | 17 | } |
348 | 17 | if (!jet3_iconv_code) { |
349 | 4 | jet3_iconv_code = "CP1252"; |
350 | 4 | } |
351 | | |
352 | 17 | mdb->iconv_out = iconv_open(jet3_iconv_code, iconv_code); |
353 | 17 | mdb->iconv_in = iconv_open(iconv_code, jet3_iconv_code); |
354 | 17 | } |
355 | | #elif defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64) || defined(WINDOWS) |
356 | | mdb->locale = _create_locale(LC_CTYPE, ".65001"); |
357 | | #else |
358 | | mdb->locale = newlocale(LC_CTYPE_MASK, "C.UTF-8", NULL); |
359 | | #endif |
360 | 17 | } |
361 | | void mdb_iconv_close(MdbHandle *mdb) |
362 | 19 | { |
363 | 19 | #ifdef HAVE_ICONV |
364 | 19 | if (mdb->iconv_out != (iconv_t)-1) iconv_close(mdb->iconv_out); |
365 | 19 | if (mdb->iconv_in != (iconv_t)-1) iconv_close(mdb->iconv_in); |
366 | | #elif defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64) || defined(WINDOWS) |
367 | | if (mdb->locale) _free_locale(mdb->locale); |
368 | | #else |
369 | | if (mdb->locale) freelocale(mdb->locale); |
370 | | #endif |
371 | 19 | } |
372 | | |
373 | | |