Coverage Report

Created: 2025-03-18 06:55

/src/libunistring/lib/striconveha.c
Line
Count
Source (jump to first uncovered line)
1
/* Character set conversion with error handling and autodetection.
2
   Copyright (C) 2002, 2005, 2007, 2009-2024 Free Software Foundation, Inc.
3
   Written by Bruno Haible.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
#include <config.h>
19
20
/* Specification.  */
21
#include "striconveha.h"
22
23
#include <errno.h>
24
#include <stdlib.h>
25
#include <string.h>
26
27
#include "malloca.h"
28
#include "c-strcase.h"
29
#include "striconveh.h"
30
31
#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))
32
33
34
/* Autodetection list.  */
35
36
/* One node of the singly linked list of autodetection aliases.  */
struct autodetect_alias
{
  /* Following node, or NULL when this is the last one.  */
  struct autodetect_alias *next;
  /* Alias name; it is compared against the caller's from_codeset.  */
  const char *name;
  /* NULL-terminated array of encoding names, attempted in array order.  */
  const char * const *encodings_to_try;
};
42
43
/* Candidate encodings for UTF-8 autodetection.
   UTF-8 comes first: hardly any ISO-8859-1 input happens to be valid
   UTF-8, whereas many UTF-8 inputs are also valid ISO-8859-1.  */
static const char * const autodetect_utf8_try[] =
{
  "UTF-8",
  "ISO-8859-1",
  NULL
};
50
/* Candidate encodings for Japanese autodetection.
   1. The 7-bit encoding goes first; it fails as soon as the input
      contains a byte >= 0x80.
   2. EUC-JP is next.  Short SHIFT_JIS inputs may therefore come out
      wrong; this is unavoidable, and people will condemn SHIFT_JIS.
      With SHIFT_JIS ahead of EUC-JP, some short EUC-JP inputs would come
      out wrong instead, and people would condemn EUC-JP and Unix, which
      would not be good.
   3. SHIFT_JIS is the last resort.  */
static const char * const autodetect_jp_try[] =
{
  "ISO-2022-JP-2",
  "EUC-JP",
  "SHIFT_JIS",
  NULL
};
63
/* Candidate encodings for Korean autodetection.
   The 7-bit encoding goes first; it fails as soon as the input contains
   a byte >= 0x80.  EUC-KR is the fallback.  */
static const char * const autodetect_kr_try[] =
{
  "ISO-2022-KR",
  "EUC-KR",
  NULL
};
71
72
static struct autodetect_alias autodetect_predefined[] =
73
{
74
  { &autodetect_predefined[1], "autodetect_utf8", autodetect_utf8_try },
75
  { &autodetect_predefined[2], "autodetect_jp",   autodetect_jp_try },
76
  { NULL,                      "autodetect_kr",   autodetect_kr_try }
77
};
78
79
/* Head of the alias list; initially the first built-in alias.  */
static struct autodetect_alias *autodetect_list = autodetect_predefined;

/* Address of the `next' field of the current last list node.  Appending
   an alias stores the new node through this pointer.  */
static struct autodetect_alias **autodetect_list_end =
  &autodetect_predefined[SIZEOF (autodetect_predefined) - 1].next;
82
83
int
84
uniconv_register_autodetect (const char *name,
85
                             const char * const *try_in_order)
86
0
{
87
0
  size_t namelen;
88
0
  size_t listlen;
89
0
  size_t memneed;
90
0
  size_t i;
91
92
  /* The TRY_IN_ORDER list must not be empty.  */
93
0
  if (try_in_order[0] == NULL)
94
0
    {
95
0
      errno = EINVAL;
96
0
      return -1;
97
0
    }
98
99
  /* We must deep-copy NAME and TRY_IN_ORDER, because they may be allocated
100
     with dynamic extent.  */
101
0
  namelen = strlen (name) + 1;
102
0
  memneed = sizeof (struct autodetect_alias) + namelen + sizeof (char *);
103
0
  for (i = 0; try_in_order[i] != NULL; i++)
104
0
    memneed += sizeof (char *) + strlen (try_in_order[i]) + 1;
105
0
  listlen = i;
106
107
0
  void *memory = malloc (memneed);
108
0
  if (memory != NULL)
109
0
    {
110
0
      struct autodetect_alias *new_alias = memory;
111
0
      memory = new_alias + 1;
112
113
0
      char const **new_try_in_order = memory;
114
0
      memory = new_try_in_order + listlen + 1;
115
116
0
      char *new_name = memcpy (memory, name, namelen);
117
0
      memory = new_name + namelen;
118
119
0
      for (i = 0; i < listlen; i++)
120
0
        {
121
0
          size_t len = strlen (try_in_order[i]) + 1;
122
0
          char *copy = memcpy (memory, try_in_order[i], len);
123
0
          new_try_in_order[i] = copy;
124
0
          memory = copy + len;
125
0
        }
126
0
      new_try_in_order[i] = NULL;
127
128
      /* Now insert the new alias.  */
129
0
      new_alias->name = new_name;
130
0
      new_alias->encodings_to_try = new_try_in_order;
131
0
      new_alias->next = NULL;
132
      /* FIXME: Not multithread-safe.  */
133
0
      *autodetect_list_end = new_alias;
134
0
      autodetect_list_end = &new_alias->next;
135
0
      return 0;
136
0
    }
137
0
  else
138
0
    {
139
0
      errno = ENOMEM;
140
0
      return -1;
141
0
    }
142
0
}
143
144
/* Like mem_iconveha, except no handling of transliteration.  */
145
static int
146
mem_iconveha_notranslit (const char *src, size_t srclen,
147
                         const char *from_codeset, const char *to_codeset,
148
                         enum iconv_ilseq_handler handler,
149
                         size_t *offsets,
150
                         char **resultp, size_t *lengthp)
151
0
{
152
0
  int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler,
153
0
                            offsets, resultp, lengthp);
154
0
  if (retval >= 0 || errno != EINVAL)
155
0
    return retval;
156
0
  else
157
0
    {
158
0
      struct autodetect_alias *alias;
159
160
      /* Unsupported from_codeset or to_codeset. Check whether the caller
161
         requested autodetection.  */
162
0
      for (alias = autodetect_list; alias != NULL; alias = alias->next)
163
0
        if (strcmp (from_codeset, alias->name) == 0)
164
0
          {
165
0
            const char * const *encodings;
166
167
0
            if (handler != iconveh_error)
168
0
              {
169
                /* First try all encodings without any forgiving.  */
170
0
                encodings = alias->encodings_to_try;
171
0
                do
172
0
                  {
173
0
                    retval = mem_iconveha_notranslit (src, srclen,
174
0
                                                      *encodings, to_codeset,
175
0
                                                      iconveh_error, offsets,
176
0
                                                      resultp, lengthp);
177
0
                    if (!(retval < 0 && errno == EILSEQ))
178
0
                      return retval;
179
0
                    encodings++;
180
0
                  }
181
0
                while (*encodings != NULL);
182
0
              }
183
184
0
            encodings = alias->encodings_to_try;
185
0
            do
186
0
              {
187
0
                retval = mem_iconveha_notranslit (src, srclen,
188
0
                                                  *encodings, to_codeset,
189
0
                                                  handler, offsets,
190
0
                                                  resultp, lengthp);
191
0
                if (!(retval < 0 && errno == EILSEQ))
192
0
                  return retval;
193
0
                encodings++;
194
0
              }
195
0
            while (*encodings != NULL);
196
197
            /* Return the last call's result.  */
198
0
            return -1;
199
0
          }
200
201
      /* It wasn't an autodetection name.  */
202
0
      errno = EINVAL;
203
0
      return -1;
204
0
    }
205
0
}
206
207
int
208
mem_iconveha (const char *src, size_t srclen,
209
              const char *from_codeset, const char *to_codeset,
210
              bool transliterate,
211
              enum iconv_ilseq_handler handler,
212
              size_t *offsets,
213
              char **resultp, size_t *lengthp)
214
0
{
215
0
  if (srclen == 0)
216
0
    {
217
      /* Nothing to convert.  */
218
0
      *lengthp = 0;
219
0
      return 0;
220
0
    }
221
222
  /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5 or Citrus/FreeBSD/macOS
223
     iconv, we want to use transliteration.  */
224
0
#if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
225
0
     && !defined __UCLIBC__) \
226
0
    || _LIBICONV_VERSION >= 0x0105 \
227
0
    || defined ICONV_SET_TRANSLITERATE
228
0
  if (transliterate)
229
0
    {
230
0
      int retval;
231
0
      size_t len = strlen (to_codeset);
232
0
      char *to_codeset_suffixed = (char *) malloca (len + 10 + 1);
233
0
      if (to_codeset_suffixed == NULL)
234
0
        {
235
0
          errno = ENOMEM;
236
0
          return -1;
237
0
        }
238
0
      memcpy (to_codeset_suffixed, to_codeset, len);
239
0
      memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1);
240
241
0
      retval = mem_iconveha_notranslit (src, srclen,
242
0
                                        from_codeset, to_codeset_suffixed,
243
0
                                        handler, offsets, resultp, lengthp);
244
245
0
      freea (to_codeset_suffixed);
246
247
0
      return retval;
248
0
    }
249
0
  else
250
0
#endif
251
0
    return mem_iconveha_notranslit (src, srclen,
252
0
                                    from_codeset, to_codeset,
253
0
                                    handler, offsets, resultp, lengthp);
254
0
}
255
256
/* Like str_iconveha, except no handling of transliteration.  */
257
static char *
258
str_iconveha_notranslit (const char *src,
259
                         const char *from_codeset, const char *to_codeset,
260
                         enum iconv_ilseq_handler handler)
261
0
{
262
0
  char *result = str_iconveh (src, from_codeset, to_codeset, handler);
263
264
0
  if (result != NULL || errno != EINVAL)
265
0
    return result;
266
0
  else
267
0
    {
268
0
      struct autodetect_alias *alias;
269
270
      /* Unsupported from_codeset or to_codeset. Check whether the caller
271
         requested autodetection.  */
272
0
      for (alias = autodetect_list; alias != NULL; alias = alias->next)
273
0
        if (strcmp (from_codeset, alias->name) == 0)
274
0
          {
275
0
            const char * const *encodings;
276
277
0
            if (handler != iconveh_error)
278
0
              {
279
                /* First try all encodings without any forgiving.  */
280
0
                encodings = alias->encodings_to_try;
281
0
                do
282
0
                  {
283
0
                    result = str_iconveha_notranslit (src,
284
0
                                                      *encodings, to_codeset,
285
0
                                                      iconveh_error);
286
0
                    if (!(result == NULL && errno == EILSEQ))
287
0
                      return result;
288
0
                    encodings++;
289
0
                  }
290
0
                while (*encodings != NULL);
291
0
              }
292
293
0
            encodings = alias->encodings_to_try;
294
0
            do
295
0
              {
296
0
                result = str_iconveha_notranslit (src,
297
0
                                                  *encodings, to_codeset,
298
0
                                                  handler);
299
0
                if (!(result == NULL && errno == EILSEQ))
300
0
                  return result;
301
0
                encodings++;
302
0
              }
303
0
            while (*encodings != NULL);
304
305
            /* Return the last call's result.  */
306
0
            return NULL;
307
0
          }
308
309
      /* It wasn't an autodetection name.  */
310
0
      errno = EINVAL;
311
0
      return NULL;
312
0
    }
313
0
}
314
315
char *
316
str_iconveha (const char *src,
317
              const char *from_codeset, const char *to_codeset,
318
              bool transliterate,
319
              enum iconv_ilseq_handler handler)
320
0
{
321
0
  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
322
0
    {
323
0
      char *result = strdup (src);
324
325
0
      if (result == NULL)
326
0
        errno = ENOMEM;
327
0
      return result;
328
0
    }
329
330
  /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5 or Citrus/FreeBSD/macOS
331
     iconv, we want to use transliteration.  */
332
0
#if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
333
0
     && !defined __UCLIBC__) \
334
0
    || _LIBICONV_VERSION >= 0x0105 \
335
0
    || defined ICONV_SET_TRANSLITERATE
336
0
  if (transliterate)
337
0
    {
338
0
      char *result;
339
0
      size_t len = strlen (to_codeset);
340
0
      char *to_codeset_suffixed = (char *) malloca (len + 10 + 1);
341
0
      if (to_codeset_suffixed == NULL)
342
0
        {
343
0
          errno = ENOMEM;
344
0
          return NULL;
345
0
        }
346
0
      memcpy (to_codeset_suffixed, to_codeset, len);
347
0
      memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1);
348
349
0
      result = str_iconveha_notranslit (src, from_codeset, to_codeset_suffixed,
350
0
                                        handler);
351
352
0
      freea (to_codeset_suffixed);
353
354
0
      return result;
355
0
    }
356
0
  else
357
0
#endif
358
0
    return str_iconveha_notranslit (src, from_codeset, to_codeset, handler);
359
0
}