Coverage Report

Created: 2023-09-25 06:55

/src/mdbtools/src/libmdb/iconv.c
Line
Count
Source (jump to first uncovered line)
1
/* MDB Tools - A library for reading MS Access database files
2
 * Copyright (C) 2000 Brian Bruns
3
 *
4
 * This library is free software; you can redistribute it and/or
5
 * modify it under the terms of the GNU Library General Public
6
 * License as published by the Free Software Foundation; either
7
 * version 2 of the License, or (at your option) any later version.
8
 *
9
 * This library is distributed in the hope that it will be useful,
10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
 * Library General Public License for more details.
13
 *
14
 * You should have received a copy of the GNU Library General Public
15
 * License along with this library; if not, write to the Free Software
16
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
17
 */
18
19
#include <errno.h>
20
#include "mdbtools.h"
21
22
#ifndef MIN
/*
 * Smaller of two values. Arguments are fully parenthesised so that
 * expressions with low-precedence operators (e.g. `x & 1`) compare
 * correctly. Note that each argument may still be evaluated twice.
 */
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#endif
25
26
0
/*
 * Expand a "Unicode Compressed" byte sequence into two-byte characters.
 *
 * Decoding starts in compressed mode. A 0x00 byte in the input toggles
 * between the two modes and produces no output:
 *   - compressed:   each input byte is emitted followed by a 0x00 byte;
 *   - uncompressed: two input bytes are copied through unchanged.
 *
 * src/slen  input bytes (without any leading marker)
 * dst/dlen  output buffer and its capacity in bytes
 *
 * Returns the number of bytes written to dst (always even). Decoding stops
 * when the input is exhausted, when fewer than two bytes of room remain in
 * dst, or when a lone trailing byte is found in uncompressed mode.
 */
static size_t decompress_unicode(const char *src, size_t slen, char *dst, size_t dlen) {
  int compressed = 1;
  size_t tlen = 0;

  /* Every emitted character needs two bytes; requiring that much room up
   * front keeps an odd-sized dst from being overrun by one byte. */
  while (slen > 0 && dlen - tlen >= 2) {
    if (*src == 0) {
      /* mode switch marker */
      compressed = !compressed;
      src++;
      slen--;
    } else if (compressed) {
      dst[tlen++] = *src++;
      dst[tlen++] = 0;
      slen--;
    } else if (slen >= 2) {
      dst[tlen++] = *src++;
      dst[tlen++] = *src++;
      slen -= 2;
    } else {
      /* odd number of bytes left in uncompressed mode */
      break;
    }
  }
  return tlen;
}
48
49
#ifdef HAVE_ICONV
50
408
/*
 * Convert in_ptr/len_in to the output charset through mdb->iconv_in,
 * writing at most dlen-1 bytes to dest followed by a terminating NUL.
 * Input that iconv cannot convert is skipped one unit at a time
 * (1 byte for JET3, 2 bytes otherwise) and replaced by '?'.
 * Returns the number of bytes written, not counting the terminator.
 */
static size_t decompressed_to_utf8_with_iconv(MdbHandle *mdb, const char *in_ptr, size_t len_in, char *dest, size_t dlen) {
  char *cursor = dest;
  size_t room = dlen - 1; /* keep one byte back for the terminator */
  size_t written;

  while (room != 0) {
    size_t skip;

    iconv(mdb->iconv_in, (ICONV_CONST char **)&in_ptr, &len_in, &cursor, &room);

    /* Databases with an odd number of UCS-2 bytes have been seen in the
     * wild; a lone trailing byte cannot be converted, so stop here. */
    if (!IS_JET3(mdb) && len_in <= 1)
      break;

    /* Finished, or no more room in the output. */
    if (len_in == 0 || room == 0 || errno == E2BIG)
      break;

    /* Otherwise an unconvertible sequence was hit: step over one input
     * unit, emit a placeholder and keep going. */
    skip = IS_JET3(mdb) ? 1 : 2;
    in_ptr += skip;
    len_in -= skip;
    *cursor++ = '?';
    room--;
  }

  written = dlen - (room + 1);
  dest[written] = '\0';
  return written;
}
74
#else
75
/*
 * Convert single-byte input to UTF-8 by mapping each byte value to the
 * code point of the same number: bytes below 0x80 are copied, bytes at or
 * above 0x80 become a two-byte sequence.
 *
 * At most dlen-1 bytes are written, followed by a terminating NUL. A
 * two-byte sequence is never split: conversion stops before a character
 * that does not fit completely. Nothing is written when dlen is 0.
 *
 * Returns the number of bytes written, not counting the terminator.
 */
static size_t latin1_to_utf8_without_iconv(const char *in_ptr, size_t len_in, char *dest, size_t dlen) {
  size_t used = 0;
  size_t i;

  /* No room even for the terminator. */
  if (dlen == 0)
    return 0;

  for (i = 0; i < len_in; i++) {
    unsigned char c = (unsigned char)in_ptr[i];
    size_t need = (c & 0x80) ? 2 : 1;

    /* used <= dlen-1 always holds, so this subtraction cannot wrap. */
    if (need > dlen - 1 - used)
      break;

    if (need == 2) {
      dest[used++] = (char)(0xC0 | (c >> 6));
      dest[used++] = (char)(0x80 | (c & 0x3F));
    } else {
      dest[used++] = (char)c;
    }
  }
  dest[used] = '\0';
  return used;
}
90
91
/*
 * Convert little-endian two-byte characters to a multibyte string using
 * the given locale.
 *
 * in_ptr/len_in  input bytes; a trailing odd byte is ignored
 * dest/dlen      output buffer; at most dlen-1 bytes plus a NUL are written
 *
 * Returns the number of bytes written, not counting the terminator.
 * Returns 0 and leaves dest as an empty string when memory cannot be
 * allocated or the conversion fails. Nothing is written when dlen is 0.
 */
static size_t unicode2ascii_locale(mdb_locale_t locale, const char *in_ptr, size_t len_in, char *dest, size_t dlen) {
    size_t i;
    size_t count = 0;
    size_t n_chars = len_in / 2;
    size_t len_out;
    wchar_t *w;

    if (dlen == 0)
        return 0;
    len_out = dlen - 1;

    w = malloc((n_chars + 1) * sizeof *w);
    if (!w) {
        dest[0] = '\0';
        return 0;
    }

    /* Assemble each wide character from its low and high byte. */
    for (i = 0; i < n_chars; i++)
    {
        w[i] = (unsigned char)in_ptr[2*i] + ((unsigned char)in_ptr[2*i+1] << 8);
    }
    w[n_chars] = '\0';

#if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64) || defined(WINDOWS)
    count = _wcstombs_l(dest, w, len_out, locale);
#elif defined(HAVE_WCSTOMBS_L)
    count = wcstombs_l(dest, w, len_out, locale);
#else
    /* No locale-aware variant: switch the thread locale temporarily. */
    locale_t oldlocale = uselocale(locale);
    count = wcstombs(dest, w, len_out);
    uselocale(oldlocale);
#endif
    free(w);

    if (count == (size_t)-1) {
        /* Conversion failed part-way; do not hand back an unterminated buffer. */
        dest[0] = '\0';
        return 0;
    }

    dest[count] = '\0';
    return count;
}
119
120
/*
 * iconv-free conversion of decompressed text into dest.
 *
 * JET3 with code page 1252: bytes are expanded to UTF-8.
 * JET3 with any other code page: bytes are copied through unchanged.
 * Otherwise: input is treated as two-byte characters and converted with
 * mdb->locale.
 *
 * At most dlen-1 bytes plus a terminating NUL are written. Returns the
 * number of bytes actually stored in dest, not counting the terminator.
 */
static size_t decompressed_to_utf8_without_iconv(MdbHandle *mdb, const char *in_ptr, size_t len_in, char *dest, size_t dlen) {
  if (IS_JET3(mdb)) {
    int n;

    if (mdb->f->code_page == 1252) {
      return latin1_to_utf8_without_iconv(in_ptr, len_in, dest, dlen);
    }
    if (dlen == 0)
      return 0;

    /* Use snprintf's return value rather than %n: it is available on every
     * C runtime, and clamping it yields the length that really fit. */
    n = snprintf(dest, dlen, "%.*s", (int)len_in, in_ptr);
    if (n < 0) {
      dest[0] = '\0';
      return 0;
    }
    return ((size_t)n < dlen) ? (size_t)n : dlen - 1;
  }
  return unicode2ascii_locale(mdb->locale, in_ptr, len_in, dest, dlen);
}
131
#endif
132
133
/*
134
 * This function is used in reading text data from an MDB table.
135
 * 'dest' will receive a converted, null-terminated string.
136
 * dlen is the available size of the destination buffer.
137
 * Returns the length of the converted string, not including the terminator.
138
 */
139
int
140
mdb_unicode2ascii(MdbHandle *mdb, const char *src, size_t slen, char *dest, size_t dlen)
141
408
{
142
408
  char *tmp = NULL;
143
408
  size_t len_in;
144
408
  const char *in_ptr = NULL;
145
146
408
  if ((!src) || (!dest) || (!dlen))
147
0
    return 0;
148
149
  /* Uncompress 'Unicode Compressed' string into tmp */
150
408
  if (!IS_JET3(mdb) && (slen>=2)
151
408
      && ((src[0]&0xff)==0xff) && ((src[1]&0xff)==0xfe)) {
152
0
    tmp = g_malloc(slen*2);
153
0
    len_in = decompress_unicode(src + 2, slen - 2, tmp, slen * 2);
154
0
    in_ptr = tmp;
155
408
  } else {
156
408
    len_in = slen;
157
408
    in_ptr = src;
158
408
  }
159
160
408
#ifdef HAVE_ICONV
161
408
  dlen = decompressed_to_utf8_with_iconv(mdb, in_ptr, len_in, dest, dlen);
162
#else
163
  dlen = decompressed_to_utf8_without_iconv(mdb, in_ptr, len_in, dest, dlen);
164
#endif
165
166
408
  if (tmp) g_free(tmp);
167
408
  return dlen;
168
408
}
169
170
/*
171
 * This function is used in writing text data to an MDB table.
172
 * If slen is 0, strlen will be used to calculate src's length.
173
 */
174
int
175
mdb_ascii2unicode(MdbHandle *mdb, const char *src, size_t slen, char *dest, size_t dlen)
176
0
{
177
0
        size_t len_in, len_out;
178
0
        const char *in_ptr = NULL;
179
0
        char *out_ptr = NULL;
180
181
0
  if ((!src) || (!dest) || (!dlen))
182
0
    return 0;
183
184
0
        in_ptr = src;
185
0
        out_ptr = dest;
186
0
        len_in = (slen) ? slen : strlen(in_ptr);
187
0
        len_out = dlen;
188
189
0
#ifdef HAVE_ICONV
190
0
  iconv(mdb->iconv_out, (ICONV_CONST char **)&in_ptr, &len_in, &out_ptr, &len_out);
191
  //printf("len_in %d len_out %d\n", len_in, len_out);
192
0
  dlen -= len_out;
193
#else
194
  if (IS_JET3(mdb)) {
195
    int count;
196
    snprintf(out_ptr, len_out, "%.*s%n", (int)len_in, in_ptr, &count);
197
    dlen = count;
198
  } else {
199
    unsigned int i;
200
    slen = MIN(len_in, len_out/2);
201
    dlen = slen*2;
202
    for (i=0; i<slen; i++) {
203
      out_ptr[i*2] = in_ptr[i];
204
      out_ptr[i*2+1] = 0;
205
    }
206
  }
207
#endif
208
209
  /* Unicode Compression */
210
0
  if(!IS_JET3(mdb) && (dlen>4)) {
211
0
    unsigned char *tmp = g_malloc(dlen);
212
0
    unsigned int tptr = 0, dptr = 0;
213
0
    int comp = 1;
214
215
0
    tmp[tptr++] = 0xff;
216
0
    tmp[tptr++] = 0xfe;
217
0
    while((dptr < dlen) && (tptr < dlen)) {
218
0
      if (((dest[dptr+1]==0) && (comp==0))
219
0
       || ((dest[dptr+1]!=0) && (comp==1))) {
220
        /* switch encoding mode */
221
0
        tmp[tptr++] = 0;
222
0
        comp = (comp) ? 0 : 1;
223
0
      } else if (dest[dptr]==0) {
224
        /* this string cannot be compressed */
225
0
        tptr = dlen;
226
0
      } else if (comp==1) {
227
        /* encode compressed character */
228
0
        tmp[tptr++] = dest[dptr];
229
0
        dptr += 2;
230
0
      } else if (tptr+1 < dlen) {
231
        /* encode uncompressed character */
232
0
        tmp[tptr++] = dest[dptr];
233
0
        tmp[tptr++] = dest[dptr+1];
234
0
        dptr += 2;
235
0
      } else {
236
        /* could not encode uncompressed character
237
         * into single byte */
238
0
        tptr = dlen;
239
0
      }
240
0
    }
241
0
    if (tptr < dlen) {
242
0
      memcpy(dest, tmp, tptr);
243
0
      dlen = tptr;
244
0
    }
245
0
    g_free(tmp);
246
0
  }
247
248
0
  return dlen;
249
0
}
250
251
const char*
252
mdb_target_charset(MdbHandle *mdb)
253
0
{
254
0
#ifdef HAVE_ICONV
255
0
  const char *iconv_code = getenv("MDBICONV");
256
0
  if (!iconv_code)
257
0
    iconv_code = "UTF-8";
258
0
  return iconv_code;
259
#else
260
  if (!IS_JET3(mdb))
261
    return "ISO-8859-1";
262
  return NULL; // same as input: unknown
263
#endif
264
0
}
265
266
/* See: https://docs.microsoft.com/en-us/windows/win32/Intl/code-page-identifiers */
267
#ifdef HAVE_ICONV
268
17
/*
 * Map a Windows code page identifier to an iconv charset name.
 * Returns NULL when the code page is not in the table.
 */
static const char *mdb_iconv_name_from_code_page(int code_page) {
  static const struct {
    int code_page;
    const char *iconv_name;
  } known_pages[] = {
    {   437, "IBM437" },
    {   850, "IBM850" },
    {   852, "IBM852" },
    {   855, "IBM855" },
    {   860, "IBM860" },
    {   861, "IBM861" },
    {   862, "IBM862" },
    {   865, "IBM865" },
    {   866, "IBM866" },
    {   869, "IBM869" },
    {   874, "WINDOWS-874" },
    {   932, "SHIFT-JIS" },
    {   936, "WINDOWS-936" },
    {   950, "BIG-5" },
    {   951, "BIG5-HKSCS" },
    {  1200, "UTF-16LE" },
    {  1201, "UTF-16BE" },
    {  1250, "WINDOWS-1250" },
    {  1251, "WINDOWS-1251" },
    {  1252, "WINDOWS-1252" },
    {  1253, "WINDOWS-1253" },
    {  1254, "WINDOWS-1254" },
    {  1255, "WINDOWS-1255" },
    {  1256, "WINDOWS-1256" },
    {  1257, "WINDOWS-1257" },
    {  1258, "WINDOWS-1258" },
    {  1361, "CP1361" },
    { 12000, "UTF-32LE" },
    { 12001, "UTF-32BE" },
    { 20866, "KOI8-R" },
    { 20932, "EUC-JP" },
    { 21866, "KOI8-U" },
    { 28591, "ISO-8859-1" },
    { 28592, "ISO-8859-2" },
    { 28593, "ISO-8859-3" },
    { 28594, "ISO-8859-4" },
    { 28595, "ISO-8859-5" },
    { 28596, "ISO-8859-6" },
    { 28597, "ISO-8859-7" },
    { 28598, "ISO-8859-8" },
    { 28599, "ISO-8859-9" },
    /* ISO-8859-13 and -15 sit in the 286xx range, not 285xx */
    { 28603, "ISO-8859-13" },
    { 28605, "ISO-8859-15" },
    { 51932, "EUC-JP" },
    { 51936, "EUC-CN" },
    { 51949, "EUC-KR" },
    { 65000, "UTF-7" },
    { 65001, "UTF-8" },
  };
  size_t i;

  for (i = 0; i < sizeof(known_pages) / sizeof(known_pages[0]); i++) {
    if (known_pages[i].code_page == code_page)
      return known_pages[i].iconv_name;
  }
  return NULL;
}
323
#endif
324
325
/*
 * Set up the text conversion state of a handle.
 *
 * With iconv, two descriptors are opened between the output charset
 * (MDBICONV environment variable, default "UTF-8") and the database
 * charset. The database charset is "UCS-2LE" for non-JET3 files; for JET3
 * files it is taken from the MDB_JET3_CHARSET environment variable, else
 * from the code page stored in the file, else "CP1252".
 * Without iconv, a UTF-8 LC_CTYPE locale is created instead.
 *
 * The results of iconv_open / locale creation are stored unchecked.
 */
void mdb_iconv_init(MdbHandle *mdb)
{
  /* output charset: environment override or UTF-8 */
  const char *iconv_code = getenv("MDBICONV");
  if (!iconv_code)
    iconv_code = "UTF-8";

#ifdef HAVE_ICONV
  const char *db_charset;

  if (IS_JET3(mdb)) {
    /* environment override first */
    db_charset = getenv("MDB_JET3_CHARSET");

    /* Then the code page embedded in the database. Note that individual
     * columns can override this value, but per-column code pages are not
     * supported by libmdb. */
    if (!db_charset)
      db_charset = mdb_iconv_name_from_code_page(mdb->f->code_page);

    /* last resort */
    if (!db_charset)
      db_charset = "CP1252";
  } else {
    db_charset = "UCS-2LE";
  }

  mdb->iconv_out = iconv_open(db_charset, iconv_code);
  mdb->iconv_in = iconv_open(iconv_code, db_charset);
#elif defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64) || defined(WINDOWS)
  mdb->locale = _create_locale(LC_CTYPE, ".65001");
#else
  mdb->locale = newlocale(LC_CTYPE_MASK, "C.UTF-8", NULL);
#endif
}
361
/*
 * Release the text conversion state held by a handle.
 *
 * Each resource is reset to its "not open" value after being released, so
 * calling this again on the same handle is harmless.
 */
void mdb_iconv_close(MdbHandle *mdb)
{
#ifdef HAVE_ICONV
    if (mdb->iconv_out != (iconv_t)-1) {
        iconv_close(mdb->iconv_out);
        mdb->iconv_out = (iconv_t)-1;
    }
    if (mdb->iconv_in != (iconv_t)-1) {
        iconv_close(mdb->iconv_in);
        mdb->iconv_in = (iconv_t)-1;
    }
#elif defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64) || defined(WINDOWS)
    if (mdb->locale) {
        _free_locale(mdb->locale);
        mdb->locale = NULL;
    }
#else
    if (mdb->locale) {
        freelocale(mdb->locale);
        mdb->locale = (locale_t)0;
    }
#endif
}
372
373