Coverage Report

Created: 2023-03-26 07:33

/src/libunistring/lib/striconveha.c
Line
Count
Source (jump to first uncovered line)
1
/* Character set conversion with error handling and autodetection.
2
   Copyright (C) 2002, 2005, 2007, 2009-2022 Free Software Foundation, Inc.
3
   Written by Bruno Haible.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
#include <config.h>
19
20
/* Specification.  */
21
#include "striconveha.h"
22
23
#include <errno.h>
24
#include <stdlib.h>
25
#include <string.h>
26
27
#include "malloca.h"
28
#include "c-strcase.h"
29
#include "striconveh.h"
30
31
/* Number of elements of the array A.  A must be a true array, not a
   pointer; the argument is parenthesized so that any array-valued
   expression can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
32
33
34
/* Autodetection list.  */
35
36
/* One node of the singly linked list of autodetection aliases.
   The field order matters: the predefined table below is initialized
   positionally.  */
struct autodetect_alias
{
  /* Following node in the list, or NULL for the last node.  */
  struct autodetect_alias *next;
  /* Alias name; compared with strcmp against the requested
     from_codeset.  */
  const char *name;
  /* NULL-terminated array of encoding names, tried in array order.  */
  const char * const *encodings_to_try;
};
42
43
/* Candidate encodings for the "autodetect_utf8" alias, in trial order.
   UTF-8 goes first: very few ISO-8859-1 inputs happen to be valid UTF-8,
   whereas many UTF-8 inputs are also valid ISO-8859-1.  */
static const char * const autodetect_utf8_try[] =
{
  "UTF-8",
  "ISO-8859-1",
  NULL                  /* end of list */
};
50
/* Candidate encodings for the "autodetect_jp" alias, in trial order.  */
static const char * const autodetect_jp_try[] =
{
  /* The 7-bit encoding comes first; it fails as soon as the input
     contains a byte >= 0x80.  */
  "ISO-2022-JP-2",
  /* EUC-JP comes next.  Short SHIFT_JIS inputs may then come out wrong;
     this is unavoidable, and people will condemn SHIFT_JIS.  Had
     SHIFT_JIS been tried first, some short EUC-JP inputs would come out
     wrong instead, and people would condemn EUC-JP and Unix, which would
     not be good.  */
  "EUC-JP",
  /* SHIFT_JIS is the last resort.  */
  "SHIFT_JIS",
  NULL                  /* end of list */
};
63
/* Candidate encodings for the "autodetect_kr" alias, in trial order.  */
static const char * const autodetect_kr_try[] =
{
  /* The 7-bit encoding comes first; it fails as soon as the input
     contains a byte >= 0x80.  */
  "ISO-2022-KR",
  /* EUC-KR is the last resort.  */
  "EUC-KR",
  NULL                  /* end of list */
};
71
72
static struct autodetect_alias autodetect_predefined[] =
73
{
74
  { &autodetect_predefined[1], "autodetect_utf8", autodetect_utf8_try },
75
  { &autodetect_predefined[2], "autodetect_jp",   autodetect_jp_try },
76
  { NULL,                      "autodetect_kr",   autodetect_kr_try }
77
};
78
79
static struct autodetect_alias *autodetect_list = &autodetect_predefined[0];
80
static struct autodetect_alias **autodetect_list_end =
81
  &autodetect_predefined[SIZEOF(autodetect_predefined)-1].next;
82
83
int
84
uniconv_register_autodetect (const char *name,
85
                             const char * const *try_in_order)
86
0
{
87
0
  size_t namelen;
88
0
  size_t listlen;
89
0
  size_t memneed;
90
0
  size_t i;
91
0
  char *memory;
92
0
  struct autodetect_alias *new_alias;
93
0
  char *new_name;
94
0
  const char **new_try_in_order;
95
96
  /* The TRY_IN_ORDER list must not be empty.  */
97
0
  if (try_in_order[0] == NULL)
98
0
    {
99
0
      errno = EINVAL;
100
0
      return -1;
101
0
    }
102
103
  /* We must deep-copy NAME and TRY_IN_ORDER, because they may be allocated
104
     with dynamic extent.  */
105
0
  namelen = strlen (name) + 1;
106
0
  memneed = sizeof (struct autodetect_alias) + namelen + sizeof (char *);
107
0
  for (i = 0; try_in_order[i] != NULL; i++)
108
0
    memneed += sizeof (char *) + strlen (try_in_order[i]) + 1;
109
0
  listlen = i;
110
111
0
  memory = (char *) malloc (memneed);
112
0
  if (memory != NULL)
113
0
    {
114
0
      new_alias = (struct autodetect_alias *) memory;
115
0
      memory += sizeof (struct autodetect_alias);
116
117
0
      new_try_in_order = (const char **) memory;
118
0
      memory += (listlen + 1) * sizeof (char *);
119
120
0
      new_name = (char *) memory;
121
0
      memcpy (new_name, name, namelen);
122
0
      memory += namelen;
123
124
0
      for (i = 0; i < listlen; i++)
125
0
        {
126
0
          size_t len = strlen (try_in_order[i]) + 1;
127
0
          memcpy (memory, try_in_order[i], len);
128
0
          new_try_in_order[i] = (const char *) memory;
129
0
          memory += len;
130
0
        }
131
0
      new_try_in_order[i] = NULL;
132
133
      /* Now insert the new alias.  */
134
0
      new_alias->name = new_name;
135
0
      new_alias->encodings_to_try = new_try_in_order;
136
0
      new_alias->next = NULL;
137
      /* FIXME: Not multithread-safe.  */
138
0
      *autodetect_list_end = new_alias;
139
0
      autodetect_list_end = &new_alias->next;
140
0
      return 0;
141
0
    }
142
0
  else
143
0
    {
144
0
      errno = ENOMEM;
145
0
      return -1;
146
0
    }
147
0
}
148
149
/* Like mem_iconveha, except no handling of transliteration.  */
150
static int
151
mem_iconveha_notranslit (const char *src, size_t srclen,
152
                         const char *from_codeset, const char *to_codeset,
153
                         enum iconv_ilseq_handler handler,
154
                         size_t *offsets,
155
                         char **resultp, size_t *lengthp)
156
0
{
157
0
  int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler,
158
0
                            offsets, resultp, lengthp);
159
0
  if (retval >= 0 || errno != EINVAL)
160
0
    return retval;
161
0
  else
162
0
    {
163
0
      struct autodetect_alias *alias;
164
165
      /* Unsupported from_codeset or to_codeset. Check whether the caller
166
         requested autodetection.  */
167
0
      for (alias = autodetect_list; alias != NULL; alias = alias->next)
168
0
        if (strcmp (from_codeset, alias->name) == 0)
169
0
          {
170
0
            const char * const *encodings;
171
172
0
            if (handler != iconveh_error)
173
0
              {
174
                /* First try all encodings without any forgiving.  */
175
0
                encodings = alias->encodings_to_try;
176
0
                do
177
0
                  {
178
0
                    retval = mem_iconveha_notranslit (src, srclen,
179
0
                                                      *encodings, to_codeset,
180
0
                                                      iconveh_error, offsets,
181
0
                                                      resultp, lengthp);
182
0
                    if (!(retval < 0 && errno == EILSEQ))
183
0
                      return retval;
184
0
                    encodings++;
185
0
                  }
186
0
                while (*encodings != NULL);
187
0
              }
188
189
0
            encodings = alias->encodings_to_try;
190
0
            do
191
0
              {
192
0
                retval = mem_iconveha_notranslit (src, srclen,
193
0
                                                  *encodings, to_codeset,
194
0
                                                  handler, offsets,
195
0
                                                  resultp, lengthp);
196
0
                if (!(retval < 0 && errno == EILSEQ))
197
0
                  return retval;
198
0
                encodings++;
199
0
              }
200
0
            while (*encodings != NULL);
201
202
            /* Return the last call's result.  */
203
0
            return -1;
204
0
          }
205
206
      /* It wasn't an autodetection name.  */
207
0
      errno = EINVAL;
208
0
      return -1;
209
0
    }
210
0
}
211
212
int
213
mem_iconveha (const char *src, size_t srclen,
214
              const char *from_codeset, const char *to_codeset,
215
              bool transliterate,
216
              enum iconv_ilseq_handler handler,
217
              size_t *offsets,
218
              char **resultp, size_t *lengthp)
219
0
{
220
0
  if (srclen == 0)
221
0
    {
222
      /* Nothing to convert.  */
223
0
      *lengthp = 0;
224
0
      return 0;
225
0
    }
226
227
  /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5,
228
     we want to use transliteration.  */
229
0
#if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
230
0
     && !defined __UCLIBC__) \
231
0
    || _LIBICONV_VERSION >= 0x0105
232
0
  if (transliterate)
233
0
    {
234
0
      int retval;
235
0
      size_t len = strlen (to_codeset);
236
0
      char *to_codeset_suffixed = (char *) malloca (len + 10 + 1);
237
0
      memcpy (to_codeset_suffixed, to_codeset, len);
238
0
      memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1);
239
240
0
      retval = mem_iconveha_notranslit (src, srclen,
241
0
                                        from_codeset, to_codeset_suffixed,
242
0
                                        handler, offsets, resultp, lengthp);
243
244
0
      freea (to_codeset_suffixed);
245
246
0
      return retval;
247
0
    }
248
0
  else
249
0
#endif
250
0
    return mem_iconveha_notranslit (src, srclen,
251
0
                                    from_codeset, to_codeset,
252
0
                                    handler, offsets, resultp, lengthp);
253
0
}
254
255
/* Like str_iconveha, except no handling of transliteration.  */
256
static char *
257
str_iconveha_notranslit (const char *src,
258
                         const char *from_codeset, const char *to_codeset,
259
                         enum iconv_ilseq_handler handler)
260
0
{
261
0
  char *result = str_iconveh (src, from_codeset, to_codeset, handler);
262
263
0
  if (result != NULL || errno != EINVAL)
264
0
    return result;
265
0
  else
266
0
    {
267
0
      struct autodetect_alias *alias;
268
269
      /* Unsupported from_codeset or to_codeset. Check whether the caller
270
         requested autodetection.  */
271
0
      for (alias = autodetect_list; alias != NULL; alias = alias->next)
272
0
        if (strcmp (from_codeset, alias->name) == 0)
273
0
          {
274
0
            const char * const *encodings;
275
276
0
            if (handler != iconveh_error)
277
0
              {
278
                /* First try all encodings without any forgiving.  */
279
0
                encodings = alias->encodings_to_try;
280
0
                do
281
0
                  {
282
0
                    result = str_iconveha_notranslit (src,
283
0
                                                      *encodings, to_codeset,
284
0
                                                      iconveh_error);
285
0
                    if (!(result == NULL && errno == EILSEQ))
286
0
                      return result;
287
0
                    encodings++;
288
0
                  }
289
0
                while (*encodings != NULL);
290
0
              }
291
292
0
            encodings = alias->encodings_to_try;
293
0
            do
294
0
              {
295
0
                result = str_iconveha_notranslit (src,
296
0
                                                  *encodings, to_codeset,
297
0
                                                  handler);
298
0
                if (!(result == NULL && errno == EILSEQ))
299
0
                  return result;
300
0
                encodings++;
301
0
              }
302
0
            while (*encodings != NULL);
303
304
            /* Return the last call's result.  */
305
0
            return NULL;
306
0
          }
307
308
      /* It wasn't an autodetection name.  */
309
0
      errno = EINVAL;
310
0
      return NULL;
311
0
    }
312
0
}
313
314
char *
315
str_iconveha (const char *src,
316
              const char *from_codeset, const char *to_codeset,
317
              bool transliterate,
318
              enum iconv_ilseq_handler handler)
319
0
{
320
0
  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
321
0
    {
322
0
      char *result = strdup (src);
323
324
0
      if (result == NULL)
325
0
        errno = ENOMEM;
326
0
      return result;
327
0
    }
328
329
  /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5,
330
     we want to use transliteration.  */
331
0
#if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
332
0
     && !defined __UCLIBC__) \
333
0
    || _LIBICONV_VERSION >= 0x0105
334
0
  if (transliterate)
335
0
    {
336
0
      char *result;
337
0
      size_t len = strlen (to_codeset);
338
0
      char *to_codeset_suffixed = (char *) malloca (len + 10 + 1);
339
0
      memcpy (to_codeset_suffixed, to_codeset, len);
340
0
      memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1);
341
342
0
      result = str_iconveha_notranslit (src, from_codeset, to_codeset_suffixed,
343
0
                                        handler);
344
345
0
      freea (to_codeset_suffixed);
346
347
0
      return result;
348
0
    }
349
0
  else
350
0
#endif
351
0
    return str_iconveha_notranslit (src, from_codeset, to_codeset, handler);
352
0
}