Coverage Report

Created: 2026-02-05 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/striconveha.c
Line
Count
Source
1
/* Character set conversion with error handling and autodetection.
2
   Copyright (C) 2002, 2005, 2007, 2009-2026 Free Software Foundation, Inc.
3
   Written by Bruno Haible.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
#include <config.h>
19
20
/* Specification.  */
21
#include "striconveha.h"
22
23
#include <errno.h>
24
#include <stdlib.h>
25
#include <string.h>
26
27
#include "malloca.h"
28
#include "c-strcase.h"
29
#include "striconveh.h"
30
31
/* Number of elements of the array A.  A must be an actual array, not a
   pointer.  The argument is parenthesized so that an argument containing
   a cast or a unary operator is indexed as a whole.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
32
33
34
/* Autodetection list.  */
35
36
/* One node of the singly linked list of autodetection aliases.  */
struct autodetect_alias
{
  /* Following node in the list, or NULL for the last node.  */
  struct autodetect_alias *next;
  /* Name under which the alias is looked up.  */
  const char *name;
  /* NULL-terminated array of encoding names, in the order in which they
     are tried.  */
  const char * const *encodings_to_try;
};
42
43
/* Encodings tried for "autodetect_utf8".
   UTF-8 goes first: hardly any ISO-8859-1 input happens to be valid UTF-8,
   whereas many UTF-8 inputs are also valid ISO-8859-1.  */
static const char * const autodetect_utf8_try[] =
{
  "UTF-8",
  "ISO-8859-1",
  NULL
};
50
/* Encodings tried for "autodetect_jp".
   The 7-bit encoding goes first; it fails as soon as the input contains
   a byte >= 0x80.
   EUC-JP goes second.  Short SHIFT_JIS inputs may come out wrong this way,
   which is unavoidable; people will condemn SHIFT_JIS for it.  Had
   SHIFT_JIS been tried before EUC-JP, some short EUC-JP inputs would come
   out wrong instead, and people would condemn EUC-JP and Unix, which would
   not be good.
   SHIFT_JIS goes last.  */
static const char * const autodetect_jp_try[] =
{
  "ISO-2022-JP-2",
  "EUC-JP",
  "SHIFT_JIS",
  NULL
};
63
/* Encodings tried for "autodetect_kr".
   The 7-bit encoding goes first; it fails as soon as the input contains
   a byte >= 0x80.  EUC-KR goes last.  */
static const char * const autodetect_kr_try[] =
{
  "ISO-2022-KR",
  "EUC-KR",
  NULL
};
71
72
static struct autodetect_alias autodetect_predefined[] =
73
{
74
  { &autodetect_predefined[1], "autodetect_utf8", autodetect_utf8_try },
75
  { &autodetect_predefined[2], "autodetect_jp",   autodetect_jp_try },
76
  { NULL,                      "autodetect_kr",   autodetect_kr_try }
77
};
78
79
/* Head of the alias list; initially the first built-in alias.  */
static struct autodetect_alias *autodetect_list = autodetect_predefined;

/* Address of the 'next' member of the last list node, i.e. the place where
   a newly registered alias gets linked in.  */
static struct autodetect_alias **autodetect_list_end =
  &autodetect_predefined[SIZEOF (autodetect_predefined) - 1].next;
82
83
int
84
uniconv_register_autodetect (const char *name,
85
                             const char * const *try_in_order)
86
0
{
87
  /* The TRY_IN_ORDER list must not be empty.  */
88
0
  if (try_in_order[0] == NULL)
89
0
    {
90
0
      errno = EINVAL;
91
0
      return -1;
92
0
    }
93
94
  /* We must deep-copy NAME and TRY_IN_ORDER, because they may be allocated
95
     with dynamic extent.  */
96
0
  size_t namelen = strlen (name) + 1;
97
0
  size_t memneed = sizeof (struct autodetect_alias) + namelen + sizeof (char *);
98
0
  size_t listlen;
99
0
  {
100
0
    size_t i;
101
0
    for (i = 0; try_in_order[i] != NULL; i++)
102
0
      memneed += sizeof (char *) + strlen (try_in_order[i]) + 1;
103
0
    listlen = i;
104
0
  }
105
106
0
  void *memory = malloc (memneed);
107
0
  if (memory != NULL)
108
0
    {
109
0
      struct autodetect_alias *new_alias = memory;
110
0
      memory = new_alias + 1;
111
112
0
      char const **new_try_in_order = memory;
113
0
      memory = new_try_in_order + listlen + 1;
114
115
0
      char *new_name = memcpy (memory, name, namelen);
116
0
      memory = new_name + namelen;
117
118
0
      {
119
0
        size_t i;
120
0
        for (i = 0; i < listlen; i++)
121
0
          {
122
0
            size_t len = strlen (try_in_order[i]) + 1;
123
0
            char *copy = memcpy (memory, try_in_order[i], len);
124
0
            new_try_in_order[i] = copy;
125
0
            memory = copy + len;
126
0
          }
127
0
        new_try_in_order[i] = NULL;
128
0
      }
129
130
      /* Now insert the new alias.  */
131
0
      new_alias->name = new_name;
132
0
      new_alias->encodings_to_try = new_try_in_order;
133
0
      new_alias->next = NULL;
134
      /* FIXME: Not multithread-safe.  */
135
0
      *autodetect_list_end = new_alias;
136
0
      autodetect_list_end = &new_alias->next;
137
0
      return 0;
138
0
    }
139
0
  else
140
0
    {
141
0
      errno = ENOMEM;
142
0
      return -1;
143
0
    }
144
0
}
145
146
/* Like mem_iconveha, except no handling of transliteration.  */
147
static int
148
mem_iconveha_notranslit (const char *src, size_t srclen,
149
                         const char *from_codeset, const char *to_codeset,
150
                         enum iconv_ilseq_handler handler,
151
                         size_t *offsets,
152
                         char **resultp, size_t *lengthp)
153
0
{
154
0
  int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler,
155
0
                            offsets, resultp, lengthp);
156
0
  if (retval >= 0 || errno != EINVAL)
157
0
    return retval;
158
0
  else
159
0
    {
160
      /* Unsupported from_codeset or to_codeset. Check whether the caller
161
         requested autodetection.  */
162
0
      for (struct autodetect_alias *alias = autodetect_list;
163
0
           alias != NULL;
164
0
           alias = alias->next)
165
0
        if (streq (from_codeset, alias->name))
166
0
          {
167
0
            if (handler != iconveh_error)
168
0
              {
169
                /* First try all encodings without any forgiving.  */
170
0
                const char * const *encodings = alias->encodings_to_try;
171
0
                do
172
0
                  {
173
0
                    retval = mem_iconveha_notranslit (src, srclen,
174
0
                                                      *encodings, to_codeset,
175
0
                                                      iconveh_error, offsets,
176
0
                                                      resultp, lengthp);
177
0
                    if (!(retval < 0 && errno == EILSEQ))
178
0
                      return retval;
179
0
                    encodings++;
180
0
                  }
181
0
                while (*encodings != NULL);
182
0
              }
183
184
0
            const char * const *encodings = alias->encodings_to_try;
185
0
            do
186
0
              {
187
0
                retval = mem_iconveha_notranslit (src, srclen,
188
0
                                                  *encodings, to_codeset,
189
0
                                                  handler, offsets,
190
0
                                                  resultp, lengthp);
191
0
                if (!(retval < 0 && errno == EILSEQ))
192
0
                  return retval;
193
0
                encodings++;
194
0
              }
195
0
            while (*encodings != NULL);
196
197
            /* Return the last call's result.  */
198
0
            return -1;
199
0
          }
200
201
      /* It wasn't an autodetection name.  */
202
0
      errno = EINVAL;
203
0
      return -1;
204
0
    }
205
0
}
206
207
int
208
mem_iconveha (const char *src, size_t srclen,
209
              const char *from_codeset, const char *to_codeset,
210
              bool transliterate,
211
              enum iconv_ilseq_handler handler,
212
              size_t *offsets,
213
              char **resultp, size_t *lengthp)
214
0
{
215
0
  if (srclen == 0)
216
0
    {
217
      /* Nothing to convert.  */
218
0
      *lengthp = 0;
219
0
      return 0;
220
0
    }
221
222
  /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5 or Citrus/FreeBSD/macOS
223
     iconv, we want to use transliteration.  */
224
0
#if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
225
0
     && !defined __UCLIBC__) \
226
0
    || _LIBICONV_VERSION >= 0x0105 \
227
0
    || defined ICONV_SET_TRANSLITERATE
228
0
  if (transliterate)
229
0
    {
230
0
      size_t len = strlen (to_codeset);
231
0
      char *to_codeset_suffixed = (char *) malloca (len + 10 + 1);
232
0
      if (to_codeset_suffixed == NULL)
233
0
        {
234
0
          errno = ENOMEM;
235
0
          return -1;
236
0
        }
237
0
      memcpy (to_codeset_suffixed, to_codeset, len);
238
0
      memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1);
239
240
0
      int retval = mem_iconveha_notranslit (src, srclen,
241
0
                                            from_codeset, to_codeset_suffixed,
242
0
                                            handler, offsets, resultp, lengthp);
243
244
0
      freea (to_codeset_suffixed);
245
246
0
      return retval;
247
0
    }
248
0
  else
249
0
#endif
250
0
    return mem_iconveha_notranslit (src, srclen,
251
0
                                    from_codeset, to_codeset,
252
0
                                    handler, offsets, resultp, lengthp);
253
0
}
254
255
/* Like str_iconveha, except no handling of transliteration.  */
256
static char *
257
str_iconveha_notranslit (const char *src,
258
                         const char *from_codeset, const char *to_codeset,
259
                         enum iconv_ilseq_handler handler)
260
0
{
261
0
  char *result = str_iconveh (src, from_codeset, to_codeset, handler);
262
263
0
  if (result != NULL || errno != EINVAL)
264
0
    return result;
265
0
  else
266
0
    {
267
      /* Unsupported from_codeset or to_codeset. Check whether the caller
268
         requested autodetection.  */
269
0
      for (struct autodetect_alias *alias = autodetect_list;
270
0
           alias != NULL;
271
0
           alias = alias->next)
272
0
        if (streq (from_codeset, alias->name))
273
0
          {
274
0
            if (handler != iconveh_error)
275
0
              {
276
                /* First try all encodings without any forgiving.  */
277
0
                const char * const *encodings = alias->encodings_to_try;
278
0
                do
279
0
                  {
280
0
                    result = str_iconveha_notranslit (src,
281
0
                                                      *encodings, to_codeset,
282
0
                                                      iconveh_error);
283
0
                    if (!(result == NULL && errno == EILSEQ))
284
0
                      return result;
285
0
                    encodings++;
286
0
                  }
287
0
                while (*encodings != NULL);
288
0
              }
289
290
0
            const char * const *encodings = alias->encodings_to_try;
291
0
            do
292
0
              {
293
0
                result = str_iconveha_notranslit (src,
294
0
                                                  *encodings, to_codeset,
295
0
                                                  handler);
296
0
                if (!(result == NULL && errno == EILSEQ))
297
0
                  return result;
298
0
                encodings++;
299
0
              }
300
0
            while (*encodings != NULL);
301
302
            /* Return the last call's result.  */
303
0
            return NULL;
304
0
          }
305
306
      /* It wasn't an autodetection name.  */
307
0
      errno = EINVAL;
308
0
      return NULL;
309
0
    }
310
0
}
311
312
char *
313
str_iconveha (const char *src,
314
              const char *from_codeset, const char *to_codeset,
315
              bool transliterate,
316
              enum iconv_ilseq_handler handler)
317
0
{
318
0
  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
319
0
    {
320
0
      char *result = strdup (src);
321
322
0
      if (result == NULL)
323
0
        errno = ENOMEM;
324
0
      return result;
325
0
    }
326
327
  /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5 or Citrus/FreeBSD/macOS
328
     iconv, we want to use transliteration.  */
329
0
#if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
330
0
     && !defined __UCLIBC__) \
331
0
    || _LIBICONV_VERSION >= 0x0105 \
332
0
    || defined ICONV_SET_TRANSLITERATE
333
0
  if (transliterate)
334
0
    {
335
0
      size_t len = strlen (to_codeset);
336
0
      char *to_codeset_suffixed = (char *) malloca (len + 10 + 1);
337
0
      if (to_codeset_suffixed == NULL)
338
0
        {
339
0
          errno = ENOMEM;
340
0
          return NULL;
341
0
        }
342
0
      memcpy (to_codeset_suffixed, to_codeset, len);
343
0
      memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1);
344
345
0
      char *result =
346
0
        str_iconveha_notranslit (src, from_codeset, to_codeset_suffixed,
347
0
                                 handler);
348
349
0
      freea (to_codeset_suffixed);
350
351
0
      return result;
352
0
    }
353
0
  else
354
0
#endif
355
0
    return str_iconveha_notranslit (src, from_codeset, to_codeset, handler);
356
0
}