Coverage Report

Created: 2026-02-05 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/striconveh.c
Line
Count
Source
1
/* Character set conversion with error handling.
2
   Copyright (C) 2001-2026 Free Software Foundation, Inc.
3
   Written by Bruno Haible and Simon Josefsson.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
#include <config.h>
19
20
/* Specification.  */
21
#include "striconveh.h"
22
23
#include <errno.h>
24
#include <stdlib.h>
25
#include <string.h>
26
27
#if HAVE_ICONV
28
# include <iconv.h>
29
# include "unistr.h"
30
#endif
31
32
#include "c-strcase.h"
33
#include "c-strcaseeq.h"
34
35
#ifndef SIZE_MAX
36
# define SIZE_MAX ((size_t) -1)
37
#endif
38
39
40
#if HAVE_ICONV
41
42
/* The caller must provide an iconveh_t, not just an iconv_t, because when a
43
   conversion error occurs, we may have to determine the Unicode representation
44
   of the inconvertible character.  */
45
46
int
47
iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
48
0
{
49
0
  iconv_t cd = iconv_open (to_codeset, from_codeset);
50
51
0
  iconv_t cd1;
52
0
  if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
53
0
    cd1 = (iconv_t)(-1);
54
0
  else
55
0
    {
56
0
      cd1 = iconv_open ("UTF-8", from_codeset);
57
0
      if (cd1 == (iconv_t)(-1))
58
0
        {
59
0
          int saved_errno = errno;
60
0
          if (cd != (iconv_t)(-1))
61
0
            iconv_close (cd);
62
0
          errno = saved_errno;
63
0
          return -1;
64
0
        }
65
0
    }
66
67
0
  iconv_t cd2;
68
0
  if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
69
0
# if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
70
0
      && !defined __UCLIBC__) \
71
0
     || _LIBICONV_VERSION >= 0x0105 \
72
0
     || defined ICONV_SET_TRANSLITERATE
73
0
      || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
74
0
# endif
75
0
     )
76
0
    cd2 = (iconv_t)(-1);
77
0
  else
78
0
    {
79
0
      cd2 = iconv_open (to_codeset, "UTF-8");
80
0
      if (cd2 == (iconv_t)(-1))
81
0
        {
82
0
          int saved_errno = errno;
83
0
          if (cd1 != (iconv_t)(-1))
84
0
            iconv_close (cd1);
85
0
          if (cd != (iconv_t)(-1))
86
0
            iconv_close (cd);
87
0
          errno = saved_errno;
88
0
          return -1;
89
0
        }
90
0
    }
91
92
0
  cdp->cd = cd;
93
0
  cdp->cd1 = cd1;
94
0
  cdp->cd2 = cd2;
95
0
  return 0;
96
0
}
97
98
int
99
iconveh_close (const iconveh_t *cd)
100
0
{
101
0
  if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
102
0
    {
103
      /* Return -1, but preserve the errno from iconv_close.  */
104
0
      int saved_errno = errno;
105
0
      if (cd->cd1 != (iconv_t)(-1))
106
0
        iconv_close (cd->cd1);
107
0
      if (cd->cd != (iconv_t)(-1))
108
0
        iconv_close (cd->cd);
109
0
      errno = saved_errno;
110
0
      return -1;
111
0
    }
112
0
  if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
113
0
    {
114
      /* Return -1, but preserve the errno from iconv_close.  */
115
0
      int saved_errno = errno;
116
0
      if (cd->cd != (iconv_t)(-1))
117
0
        iconv_close (cd->cd);
118
0
      errno = saved_errno;
119
0
      return -1;
120
0
    }
121
0
  if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
122
0
    return -1;
123
0
  return 0;
124
0
}
125
126
/* iconv_carefully behaves like iconv, except that it stops as soon as it
   encounters a conversion error, and it stores in *INCREMENTED a boolean
   telling whether the input pointers have been advanced past the error
   location.  */
# if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
     && !(defined __GLIBC__ && !defined __UCLIBC__)
/* NetBSD iconv() inserts a question mark if it cannot convert.
   Only GNU libiconv (but not Apple's variant of it) and GNU libc are known
   to prefer failing over doing a lossy conversion.  For every other
   implementation the input is fed to iconv() in small pieces, so that a
   positive return value (the count of irreversible conversions) can be
   detected right away and reported as EILSEQ.  */
static size_t
iconv_carefully (iconv_t cd,
                 const char **inbuf, size_t *inbytesleft,
                 char **outbuf, size_t *outbytesleft,
                 bool *incremented)
{
  const char *cursor = *inbuf;
  const char *const limit = cursor + *inbytesleft;
  char *dst = *outbuf;
  size_t dstleft = *outbytesleft;
  const char *mark;
  size_t res;

  for (;;)
    {
      size_t chunk = 1;

      mark = cursor;
      res = (size_t)(-1);

      /* Offer a growing number of bytes as long as iconv() reports an
         incomplete input sequence.  */
      while (cursor + chunk <= limit)
        {
          res = iconv (cd,
                       (ICONV_CONST char **) &cursor, &chunk,
                       &dst, &dstleft);
          if (res != (size_t)(-1) || errno != EINVAL)
            break;
          /* iconv can eat up a shift sequence but give EINVAL while
             attempting to convert the first character.  E.g. libiconv does
             this.  */
          if (cursor > mark)
            {
              res = 0;
              break;
            }
          chunk++;
        }

      if (res != 0)
        break;

      /* Only cleanly converted output is published to the caller.  */
      *outbuf = dst;
      *outbytesleft = dstleft;

      if (cursor >= limit)
        break;
    }

  *inbuf = cursor;
  *inbytesleft = limit - cursor;

  if (res == (size_t)(-1) || res == 0)
    {
      *incremented = false;
      return res;
    }

  /* iconv() has already advanced the input position.  We cannot go back to
     a previous position, otherwise the state inside CD would become
     invalid, if FROM_CODESET is a stateful encoding.  So, tell the caller
     that *INBUF has already been incremented.  */
  *incremented = (cursor > mark);
  errno = EILSEQ;
  return (size_t)(-1);
}
# else
/* With GNU libiconv and GNU libc, a plain iconv() call suffices; the input
   pointers are never reported as advanced past an error.  */
#  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
     (*(incremented) = false, \
      iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
# endif
199
200
/* iconv_carefully_1 is like iconv_carefully, except that it stops after
201
   converting one character or one shift sequence.  */
202
static size_t
203
iconv_carefully_1 (iconv_t cd,
204
                   const char **inbuf, size_t *inbytesleft,
205
                   char **outbuf, size_t *outbytesleft,
206
                   bool *incremented)
207
0
{
208
0
  const char *inptr_before = *inbuf;
209
0
  const char *inptr = inptr_before;
210
0
  const char *inptr_end = inptr_before + *inbytesleft;
211
0
  char *outptr = *outbuf;
212
0
  size_t outsize = *outbytesleft;
213
0
  size_t res = (size_t)(-1);
214
215
0
  for (size_t insize = 1; inptr_before + insize <= inptr_end; insize++)
216
0
    {
217
0
      inptr = inptr_before;
218
0
      res = iconv (cd,
219
0
                   (ICONV_CONST char **) &inptr, &insize,
220
0
                   &outptr, &outsize);
221
0
      if (!(res == (size_t)(-1) && errno == EINVAL))
222
0
        break;
223
      /* iconv can eat up a shift sequence but give EINVAL while attempting
224
         to convert the first character.  E.g. libiconv does this.  */
225
0
      if (inptr > inptr_before)
226
0
        {
227
0
          res = 0;
228
0
          break;
229
0
        }
230
0
    }
231
232
0
  *inbuf = inptr;
233
0
  *inbytesleft = inptr_end - inptr;
234
# if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
235
     && !(defined __GLIBC__ && !defined __UCLIBC__)
236
  /* NetBSD iconv() inserts a question mark if it cannot convert.
237
     Only GNU libiconv (excluding the bastard Apple iconv) and GNU libc are
238
     known to prefer to fail rather than doing a lossy conversion.  */
239
  if (res != (size_t)(-1) && res > 0)
240
    {
241
      /* iconv() has already incremented INPTR.  We cannot go back to a
242
         previous INPTR, otherwise the state inside CD would become invalid,
243
         if FROM_CODESET is a stateful encoding.  So, tell the caller that
244
         *INBUF has already been incremented.  */
245
      *incremented = (inptr > inptr_before);
246
      errno = EILSEQ;
247
      return (size_t)(-1);
248
    }
249
# endif
250
251
0
  if (res != (size_t)(-1))
252
0
    {
253
0
      *outbuf = outptr;
254
0
      *outbytesleft = outsize;
255
0
    }
256
0
  *incremented = false;
257
0
  return res;
258
0
}
259
260
/* utf8conv_carefully is like iconv, except that
261
     - it converts from UTF-8 to UTF-8,
262
     - it stops as soon as it encounters a conversion error, and it returns
263
       in *INCREMENTED a boolean telling whether it has incremented the input
264
       pointers past the error location,
265
     - if one_character_only is true, it stops after converting one
266
       character.  */
267
static size_t
268
utf8conv_carefully (bool one_character_only,
269
                    const char **inbuf, size_t *inbytesleft,
270
                    char **outbuf, size_t *outbytesleft,
271
                    bool *incremented)
272
0
{
273
0
  const char *inptr = *inbuf;
274
0
  size_t insize = *inbytesleft;
275
0
  char *outptr = *outbuf;
276
0
  size_t outsize = *outbytesleft;
277
0
  size_t res = 0;
278
0
  do
279
0
    {
280
0
      ucs4_t uc;
281
0
      int n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
282
0
      if (n < 0)
283
0
        {
284
0
          errno = (n == -2 ? EINVAL : EILSEQ);
285
0
          n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
286
0
          inptr += n;
287
0
          insize -= n;
288
0
          res = (size_t)(-1);
289
0
          *incremented = true;
290
0
          break;
291
0
        }
292
0
      if (outsize == 0)
293
0
        {
294
0
          errno = E2BIG;
295
0
          res = (size_t)(-1);
296
0
          *incremented = false;
297
0
          break;
298
0
        }
299
0
      int m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
300
0
      if (m == -2)
301
0
        {
302
0
          errno = E2BIG;
303
0
          res = (size_t)(-1);
304
0
          *incremented = false;
305
0
          break;
306
0
        }
307
0
      inptr += n;
308
0
      insize -= n;
309
0
      if (m == -1)
310
0
        {
311
0
          errno = EILSEQ;
312
0
          res = (size_t)(-1);
313
0
          *incremented = true;
314
0
          break;
315
0
        }
316
0
      outptr += m;
317
0
      outsize -= m;
318
0
    }
319
0
  while (!one_character_only && insize > 0);
320
321
0
  *inbuf = inptr;
322
0
  *inbytesleft = insize;
323
0
  *outbuf = outptr;
324
0
  *outbytesleft = outsize;
325
0
  return res;
326
0
}
327
328
static int
329
mem_cd_iconveh_internal (const char *src, size_t srclen,
330
                         iconv_t cd, iconv_t cd1, iconv_t cd2,
331
                         enum iconv_ilseq_handler handler,
332
                         size_t extra_alloc,
333
                         size_t *offsets,
334
                         char **resultp, size_t *lengthp)
335
0
{
336
  /* When a conversion error occurs, we cannot start using CD1 and CD2 at
337
     this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
338
     Instead, we have to start afresh from the beginning of SRC.  */
339
  /* Use a temporary buffer, so that for small strings, a single malloc()
340
     call will be sufficient.  */
341
0
# define tmpbufsize 4096
342
  /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
343
     libiconv's UCS-4-INTERNAL encoding.  */
344
0
  union { unsigned int align; char buf[tmpbufsize]; } tmp;
345
0
# define tmpbuf tmp.buf
346
347
0
  char *initial_result;
348
0
  size_t allocated;
349
0
  if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
350
0
    {
351
0
      initial_result = *resultp;
352
0
      allocated = *lengthp;
353
0
    }
354
0
  else
355
0
    {
356
0
      initial_result = tmpbuf;
357
0
      allocated = sizeof (tmpbuf);
358
0
    }
359
360
0
  char *result = initial_result;
361
362
  /* Test whether a direct conversion is possible at all.  */
363
0
  if (cd == (iconv_t)(-1))
364
0
    goto indirectly;
365
366
0
  size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
367
0
  if (offsets != NULL)
368
0
    {
369
0
      for (size_t i = 0; i < srclen; i++)
370
0
        offsets[i] = (size_t)(-1);
371
372
0
      last_length = (size_t)(-1);
373
0
    }
374
0
  size_t length = 0;
375
376
  /* First, try a direct conversion, and see whether a conversion error
377
     occurs at all.  */
378
0
  {
379
    /* Set to the initial state.  */
380
0
    iconv (cd, NULL, NULL, NULL, NULL);
381
382
0
    const char *inptr = src;
383
0
    size_t insize = srclen;
384
385
0
    while (insize > 0)
386
0
      {
387
0
        char *outptr = result + length;
388
0
        size_t outsize = allocated - extra_alloc - length;
389
390
0
        bool incremented;
391
0
        size_t res;
392
0
        if (offsets != NULL)
393
0
          {
394
0
            if (length != last_length) /* ensure that offset[] be increasing */
395
0
              {
396
0
                offsets[inptr - src] = length;
397
0
                last_length = length;
398
0
              }
399
0
            res = iconv_carefully_1 (cd,
400
0
                                     &inptr, &insize,
401
0
                                     &outptr, &outsize,
402
0
                                     &incremented);
403
0
          }
404
0
        else
405
          /* Use iconv_carefully instead of iconv here, because:
406
             - If TO_CODESET is UTF-8, we can do the error handling in this
407
               loop, no need for a second loop,
408
             - With iconv() implementations other than GNU libiconv and GNU
409
               libc, if we use iconv() in a big swoop, checking for an E2BIG
410
               return, we lose the number of irreversible conversions.  */
411
0
          res = iconv_carefully (cd,
412
0
                                 &inptr, &insize,
413
0
                                 &outptr, &outsize,
414
0
                                 &incremented);
415
416
0
        length = outptr - result;
417
0
        bool grow = (length + extra_alloc > allocated / 2);
418
0
        if (res == (size_t)(-1))
419
0
          {
420
0
            if (errno == E2BIG)
421
0
              grow = true;
422
0
            else if (errno == EINVAL)
423
0
              break;
424
0
            else if (errno == EILSEQ && handler != iconveh_error)
425
0
              {
426
0
                if (cd2 == (iconv_t)(-1))
427
0
                  {
428
                    /* TO_CODESET is UTF-8.  */
429
                    /* Error handling can produce up to 1 or 3 bytes of
430
                       output.  */
431
0
                    size_t extra_need =
432
0
                      (handler == iconveh_replacement_character ? 3 : 1);
433
0
                    if (length + extra_need + extra_alloc > allocated)
434
0
                      {
435
0
                        allocated = 2 * allocated;
436
0
                        if (length + extra_need + extra_alloc > allocated)
437
0
                          allocated = 2 * allocated;
438
0
                        if (length + extra_need + extra_alloc > allocated)
439
0
                          abort ();
440
0
                        char *memory;
441
0
                        if (result == initial_result)
442
0
                          memory = (char *) malloc (allocated);
443
0
                        else
444
0
                          memory = (char *) realloc (result, allocated);
445
0
                        if (memory == NULL)
446
0
                          {
447
0
                            if (result != initial_result)
448
0
                              free (result);
449
0
                            errno = ENOMEM;
450
0
                            return -1;
451
0
                          }
452
0
                        if (result == initial_result)
453
0
                          memcpy (memory, initial_result, length);
454
0
                        result = memory;
455
0
                        grow = false;
456
0
                      }
457
                    /* The input is invalid in FROM_CODESET.  Eat up one byte
458
                       and emit a replacement character or a question mark.  */
459
0
                    if (!incremented)
460
0
                      {
461
0
                        if (insize == 0)
462
0
                          abort ();
463
0
                        inptr++;
464
0
                        insize--;
465
0
                      }
466
0
                    if (handler == iconveh_replacement_character)
467
0
                      {
468
                        /* U+FFFD in UTF-8 encoding.  */
469
0
                        result[length+0] = '\357';
470
0
                        result[length+1] = '\277';
471
0
                        result[length+2] = '\275';
472
0
                        length += 3;
473
0
                      }
474
0
                    else
475
0
                      {
476
0
                        result[length] = '?';
477
0
                        length++;
478
0
                      }
479
0
                  }
480
0
                else
481
0
                  goto indirectly;
482
0
              }
483
0
            else
484
0
              {
485
0
                if (result != initial_result)
486
0
                  free (result);
487
0
                return -1;
488
0
              }
489
0
          }
490
0
        if (insize == 0)
491
0
          break;
492
0
        if (grow)
493
0
          {
494
0
            allocated = 2 * allocated;
495
0
            char *memory;
496
0
            if (result == initial_result)
497
0
              memory = (char *) malloc (allocated);
498
0
            else
499
0
              memory = (char *) realloc (result, allocated);
500
0
            if (memory == NULL)
501
0
              {
502
0
                if (result != initial_result)
503
0
                  free (result);
504
0
                errno = ENOMEM;
505
0
                return -1;
506
0
              }
507
0
            if (result == initial_result)
508
0
              memcpy (memory, initial_result, length);
509
0
            result = memory;
510
0
          }
511
0
      }
512
0
  }
513
514
  /* Now get the conversion state back to the initial state.
515
     But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
516
0
#if defined _LIBICONV_VERSION \
517
0
    || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
518
0
         || defined __sun)
519
0
  for (;;)
520
0
    {
521
0
      char *outptr = result + length;
522
0
      size_t outsize = allocated - extra_alloc - length;
523
0
      size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
524
0
      length = outptr - result;
525
0
      if (res == (size_t)(-1))
526
0
        {
527
0
          if (errno == E2BIG)
528
0
            {
529
530
0
              allocated = 2 * allocated;
531
0
              char *memory;
532
0
              if (result == initial_result)
533
0
                memory = (char *) malloc (allocated);
534
0
              else
535
0
                memory = (char *) realloc (result, allocated);
536
0
              if (memory == NULL)
537
0
                {
538
0
                  if (result != initial_result)
539
0
                    free (result);
540
0
                  errno = ENOMEM;
541
0
                  return -1;
542
0
                }
543
0
              if (result == initial_result)
544
0
                memcpy (memory, initial_result, length);
545
0
              result = memory;
546
0
            }
547
0
          else
548
0
            {
549
0
              if (result != initial_result)
550
0
                free (result);
551
0
              return -1;
552
0
            }
553
0
        }
554
0
      else
555
0
        break;
556
0
    }
557
0
#endif
558
559
  /* The direct conversion succeeded.  */
560
0
  goto done;
561
562
0
 indirectly:
563
  /* The direct conversion failed.
564
     Use a conversion through UTF-8.  */
565
0
  if (offsets != NULL)
566
0
    {
567
0
      for (size_t i = 0; i < srclen; i++)
568
0
        offsets[i] = (size_t)(-1);
569
570
0
      last_length = (size_t)(-1);
571
0
    }
572
0
  length = 0;
573
0
  {
574
    /* Set to the initial state.  */
575
0
    if (cd1 != (iconv_t)(-1))
576
0
      iconv (cd1, NULL, NULL, NULL, NULL);
577
0
    if (cd2 != (iconv_t)(-1))
578
0
      iconv (cd2, NULL, NULL, NULL, NULL);
579
580
0
    const bool slowly = (offsets != NULL || handler == iconveh_error);
581
0
# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
582
0
    char utf8buf[utf8bufsize + 3];
583
0
    size_t utf8len = 0;
584
0
    const char *in1ptr = src;
585
0
    size_t in1size = srclen;
586
0
    bool do_final_flush1 = true;
587
0
    bool do_final_flush2 = true;
588
589
0
    while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
590
0
      {
591
0
        char *out1ptr = utf8buf + utf8len;
592
0
        size_t out1size = utf8bufsize - utf8len;
593
594
        /* Conversion step 1: from FROM_CODESET to UTF-8.  */
595
0
        bool incremented1;
596
0
        size_t res1;
597
0
        if (in1size > 0)
598
0
          {
599
0
            if (offsets != NULL
600
0
                && length != last_length) /* ensure that offset[] be increasing */
601
0
              {
602
0
                offsets[in1ptr - src] = length;
603
0
                last_length = length;
604
0
              }
605
0
            if (cd1 != (iconv_t)(-1))
606
0
              {
607
0
                if (slowly)
608
0
                  res1 = iconv_carefully_1 (cd1,
609
0
                                            &in1ptr, &in1size,
610
0
                                            &out1ptr, &out1size,
611
0
                                            &incremented1);
612
0
                else
613
0
                  res1 = iconv_carefully (cd1,
614
0
                                          &in1ptr, &in1size,
615
0
                                          &out1ptr, &out1size,
616
0
                                          &incremented1);
617
0
              }
618
0
            else
619
0
              {
620
                /* FROM_CODESET is UTF-8.  */
621
0
                res1 = utf8conv_carefully (slowly,
622
0
                                           &in1ptr, &in1size,
623
0
                                           &out1ptr, &out1size,
624
0
                                           &incremented1);
625
0
              }
626
0
          }
627
0
        else if (do_final_flush1)
628
0
          {
629
            /* Now get the conversion state of CD1 back to the initial state.
630
               But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
631
0
# if defined _LIBICONV_VERSION \
632
0
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
633
0
          || defined __sun)
634
0
            if (cd1 != (iconv_t)(-1))
635
0
              res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
636
0
            else
637
0
# endif
638
0
              res1 = 0;
639
0
            do_final_flush1 = false;
640
0
            incremented1 = true;
641
0
          }
642
0
        else
643
0
          {
644
0
            res1 = 0;
645
0
            incremented1 = true;
646
0
          }
647
0
        if (res1 == (size_t)(-1)
648
0
            && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
649
0
          {
650
0
            if (result != initial_result)
651
0
              free (result);
652
0
            return -1;
653
0
          }
654
0
        if (res1 == (size_t)(-1)
655
0
            && errno == EILSEQ && handler != iconveh_error)
656
0
          {
657
            /* The input is invalid in FROM_CODESET.  Eat up one byte and
658
               emit a U+FFFD character or a question mark.  Room for this
659
               character was allocated at the end of utf8buf.  */
660
0
            if (!incremented1)
661
0
              {
662
0
                if (in1size == 0)
663
0
                  abort ();
664
0
                in1ptr++;
665
0
                in1size--;
666
0
              }
667
0
            if (handler == iconveh_replacement_character)
668
0
              {
669
                /* U+FFFD in UTF-8 encoding.  */
670
0
                out1ptr[0] = '\357';
671
0
                out1ptr[1] = '\277';
672
0
                out1ptr[2] = '\275';
673
0
                out1ptr += 3;
674
0
              }
675
0
            else
676
0
              *out1ptr++ = '?';
677
0
            res1 = 0;
678
0
          }
679
0
        int errno1 = errno;
680
0
        utf8len = out1ptr - utf8buf;
681
682
0
        if (offsets != NULL
683
0
            || in1size == 0
684
0
            || utf8len > utf8bufsize / 2
685
0
            || (res1 == (size_t)(-1) && errno1 == E2BIG))
686
0
          {
687
            /* Conversion step 2: from UTF-8 to TO_CODESET.  */
688
0
            const char *in2ptr = utf8buf;
689
0
            size_t in2size = utf8len;
690
691
0
            while (in2size > 0
692
0
                   || (in1size == 0 && !do_final_flush1 && do_final_flush2))
693
0
              {
694
0
                char *out2ptr = result + length;
695
0
                size_t out2size = allocated - extra_alloc - length;
696
697
0
                bool incremented2;
698
0
                size_t res2;
699
0
                if (in2size > 0)
700
0
                  {
701
0
                    if (cd2 != (iconv_t)(-1))
702
0
                      res2 = iconv_carefully (cd2,
703
0
                                              &in2ptr, &in2size,
704
0
                                              &out2ptr, &out2size,
705
0
                                              &incremented2);
706
0
                    else
707
                      /* TO_CODESET is UTF-8.  */
708
0
                      res2 = utf8conv_carefully (false,
709
0
                                                 &in2ptr, &in2size,
710
0
                                                 &out2ptr, &out2size,
711
0
                                                 &incremented2);
712
0
                  }
713
0
                else /* in1size == 0 && !do_final_flush1
714
                        && in2size == 0 && do_final_flush2 */
715
0
                  {
716
                    /* Now get the conversion state of CD1 back to the initial
717
                       state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
718
0
# if defined _LIBICONV_VERSION \
719
0
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
720
0
          || defined __sun)
721
0
                    if (cd2 != (iconv_t)(-1))
722
0
                      res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
723
0
                    else
724
0
# endif
725
0
                      res2 = 0;
726
0
                    do_final_flush2 = false;
727
0
                    incremented2 = true;
728
0
                  }
729
730
0
                length = out2ptr - result;
731
0
                bool grow = (length + extra_alloc > allocated / 2);
732
0
                if (res2 == (size_t)(-1))
733
0
                  {
734
0
                    if (errno == E2BIG)
735
0
                      grow = true;
736
0
                    else if (errno == EINVAL)
737
0
                      break;
738
0
                    else if (errno == EILSEQ && handler != iconveh_error)
739
0
                      {
740
                        /* Error handling can produce up to 10 bytes of UTF-8
741
                           output.  But TO_CODESET may be UCS-2, UTF-16 or
742
                           UCS-4, so use CD2 here as well.  */
743
0
                        ucs4_t uc;
744
745
0
                        if (incremented2)
746
0
                          {
747
0
                            if (u8_prev (&uc, (const uint8_t *) in2ptr,
748
0
                                         (const uint8_t *) utf8buf)
749
0
                                == NULL)
750
0
                              abort ();
751
0
                          }
752
0
                        else
753
0
                          {
754
0
                            int n;
755
0
                            if (in2size == 0)
756
0
                              abort ();
757
0
                            n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
758
0
                                                  in2size);
759
0
                            in2ptr += n;
760
0
                            in2size -= n;
761
0
                          }
762
763
0
                        char scratchbuf[10];
764
0
                        size_t scratchlen;
765
0
                        if (handler == iconveh_escape_sequence)
766
0
                          {
767
0
                            static char const hex[16] _GL_ATTRIBUTE_NONSTRING =
768
0
                              "0123456789ABCDEF";
769
0
                            scratchlen = 0;
770
0
                            scratchbuf[scratchlen++] = '\\';
771
0
                            if (uc < 0x10000)
772
0
                              scratchbuf[scratchlen++] = 'u';
773
0
                            else
774
0
                              {
775
0
                                scratchbuf[scratchlen++] = 'U';
776
0
                                scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
777
0
                                scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
778
0
                                scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
779
0
                                scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
780
0
                              }
781
0
                            scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
782
0
                            scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
783
0
                            scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
784
0
                            scratchbuf[scratchlen++] = hex[uc & 15];
785
0
                          }
786
0
                        else if (handler == iconveh_replacement_character)
787
0
                          {
788
                            /* U+FFFD in UTF-8 encoding.  */
789
0
                            scratchbuf[0] = '\357';
790
0
                            scratchbuf[1] = '\277';
791
0
                            scratchbuf[2] = '\275';
792
0
                            scratchlen = 3;
793
0
                          }
794
0
                        else
795
0
                          {
796
0
                            scratchbuf[0] = '?';
797
0
                            scratchlen = 1;
798
0
                          }
799
800
0
                        const char *inptr = scratchbuf;
801
0
                        size_t insize = scratchlen;
802
0
                        size_t res;
803
0
                        if (cd2 != (iconv_t)(-1))
804
0
                          {
805
0
                            char *out2ptr_try = out2ptr;
806
0
                            size_t out2size_try = out2size;
807
0
                            res = iconv (cd2,
808
0
                                         (ICONV_CONST char **) &inptr, &insize,
809
0
                                         &out2ptr_try, &out2size_try);
810
0
                            if (handler == iconveh_replacement_character
811
0
                                && (res == (size_t)(-1)
812
0
                                    ? errno == EILSEQ
813
                                    /* FreeBSD iconv(), NetBSD iconv(), and
814
                                       Solaris 11 iconv() insert a '?' if they
815
                                       cannot convert.  This is what we want.
816
                                       But musl libc iconv() inserts a '*' if it
817
                                       cannot convert.  */
818
0
                                    : (res > 0
819
0
                                       && !(out2ptr_try - out2ptr == 1
820
0
                                            && *out2ptr == '?'))))
821
0
                              {
822
                                /* The iconv() call failed.
823
                                   U+FFFD can't be converted to TO_CODESET.
824
                                   Use '?' instead.  */
825
0
                                scratchbuf[0] = '?';
826
0
                                scratchlen = 1;
827
0
                                inptr = scratchbuf;
828
0
                                insize = scratchlen;
829
0
                                res = iconv (cd2,
830
0
                                             (ICONV_CONST char **) &inptr, &insize,
831
0
                                             &out2ptr, &out2size);
832
0
                              }
833
0
                            else
834
0
                              {
835
                                /* Accept the results of the iconv() call.  */
836
0
                                out2ptr = out2ptr_try;
837
0
                                out2size = out2size_try;
838
0
                                res = 0;
839
0
                              }
840
0
                          }
841
0
                        else
842
0
                          {
843
                            /* TO_CODESET is UTF-8.  */
844
0
                            if (out2size >= insize)
845
0
                              {
846
0
                                memcpy (out2ptr, inptr, insize);
847
0
                                out2ptr += insize;
848
0
                                out2size -= insize;
849
0
                                inptr += insize;
850
0
                                insize = 0;
851
0
                                res = 0;
852
0
                              }
853
0
                            else
854
0
                              {
855
0
                                errno = E2BIG;
856
0
                                res = (size_t)(-1);
857
0
                              }
858
0
                          }
859
0
                        length = out2ptr - result;
860
0
                        if (res == (size_t)(-1) && errno == E2BIG)
861
0
                          {
862
0
                            allocated = 2 * allocated;
863
0
                            if (length + 1 + extra_alloc > allocated)
864
0
                              abort ();
865
0
                            char *memory;
866
0
                            if (result == initial_result)
867
0
                              memory = (char *) malloc (allocated);
868
0
                            else
869
0
                              memory = (char *) realloc (result, allocated);
870
0
                            if (memory == NULL)
871
0
                              {
872
0
                                if (result != initial_result)
873
0
                                  free (result);
874
0
                                errno = ENOMEM;
875
0
                                return -1;
876
0
                              }
877
0
                            if (result == initial_result)
878
0
                              memcpy (memory, initial_result, length);
879
0
                            result = memory;
880
0
                            grow = false;
881
882
0
                            out2ptr = result + length;
883
0
                            out2size = allocated - extra_alloc - length;
884
0
                            if (cd2 != (iconv_t)(-1))
885
0
                              res = iconv (cd2,
886
0
                                           (ICONV_CONST char **) &inptr,
887
0
                                           &insize,
888
0
                                           &out2ptr, &out2size);
889
0
                            else
890
0
                              {
891
                                /* TO_CODESET is UTF-8.  */
892
0
                                if (!(out2size >= insize))
893
0
                                  abort ();
894
0
                                memcpy (out2ptr, inptr, insize);
895
0
                                out2ptr += insize;
896
0
                                out2size -= insize;
897
0
                                inptr += insize;
898
0
                                insize = 0;
899
0
                                res = 0;
900
0
                              }
901
0
                            length = out2ptr - result;
902
0
                          }
903
# if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
904
     && !(defined __GLIBC__ && !defined __UCLIBC__)
905
                        /* FreeBSD iconv(), NetBSD iconv(), and Solaris 11
906
                           iconv() insert a '?' if they cannot convert.
907
                           musl libc iconv() inserts a '*' if it cannot convert.
908
                           Only GNU libiconv (excluding the bastard Apple iconv)
909
                           and GNU libc are known to prefer to fail rather than
910
                           doing a lossy conversion.  */
911
                        if (res != (size_t)(-1) && res > 0)
912
                          {
913
                            errno = EILSEQ;
914
                            res = (size_t)(-1);
915
                          }
916
# endif
917
0
                        if (res == (size_t)(-1))
918
0
                          {
919
                            /* Failure converting the ASCII replacement.  */
920
0
                            if (result != initial_result)
921
0
                              free (result);
922
0
                            return -1;
923
0
                          }
924
0
                      }
925
0
                    else
926
0
                      {
927
0
                        if (result != initial_result)
928
0
                          free (result);
929
0
                        return -1;
930
0
                      }
931
0
                  }
932
0
                if (!(in2size > 0
933
0
                      || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
934
0
                  break;
935
0
                if (grow)
936
0
                  {
937
0
                    allocated = 2 * allocated;
938
0
                    char *memory;
939
0
                    if (result == initial_result)
940
0
                      memory = (char *) malloc (allocated);
941
0
                    else
942
0
                      memory = (char *) realloc (result, allocated);
943
0
                    if (memory == NULL)
944
0
                      {
945
0
                        if (result != initial_result)
946
0
                          free (result);
947
0
                        errno = ENOMEM;
948
0
                        return -1;
949
0
                      }
950
0
                    if (result == initial_result)
951
0
                      memcpy (memory, initial_result, length);
952
0
                    result = memory;
953
0
                  }
954
0
              }
955
956
            /* Move the remaining bytes to the beginning of utf8buf.  */
957
0
            if (in2size > 0)
958
0
              memmove (utf8buf, in2ptr, in2size);
959
0
            utf8len = in2size;
960
0
          }
961
962
0
        if (res1 == (size_t)(-1))
963
0
          {
964
0
            if (errno1 == EINVAL)
965
0
              in1size = 0;
966
0
            else if (errno1 == EILSEQ)
967
0
              {
968
0
                if (result != initial_result)
969
0
                  free (result);
970
0
                errno = errno1;
971
0
                return -1;
972
0
              }
973
0
          }
974
0
      }
975
0
# undef utf8bufsize
976
0
  }
977
978
0
 done:
979
  /* Now the final memory allocation.  */
980
0
  if (result == tmpbuf)
981
0
    {
982
0
      size_t memsize = length + extra_alloc;
983
984
0
      if (*resultp != NULL && *lengthp >= memsize)
985
0
        result = *resultp;
986
0
      else
987
0
        {
988
0
          char *memory = (char *) malloc (memsize > 0 ? memsize : 1);
989
0
          if (memory != NULL)
990
0
            result = memory;
991
0
          else
992
0
            {
993
0
              errno = ENOMEM;
994
0
              return -1;
995
0
            }
996
0
        }
997
0
      memcpy (result, tmpbuf, length);
998
0
    }
999
0
  else if (result != *resultp && length + extra_alloc < allocated)
1000
0
    {
1001
      /* Shrink the allocated memory if possible.  */
1002
0
      size_t memsize = length + extra_alloc;
1003
0
      char *memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
1004
0
      if (memory != NULL)
1005
0
        result = memory;
1006
0
    }
1007
0
  *resultp = result;
1008
0
  *lengthp = length;
1009
0
  return 0;
1010
0
# undef tmpbuf
1011
0
# undef tmpbufsize
1012
0
}
1013
1014
int
1015
mem_cd_iconveh (const char *src, size_t srclen,
1016
                const iconveh_t *cd,
1017
                enum iconv_ilseq_handler handler,
1018
                size_t *offsets,
1019
                char **resultp, size_t *lengthp)
1020
0
{
1021
0
  return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1022
0
                                  handler, 0, offsets, resultp, lengthp);
1023
0
}
1024
1025
char *
1026
str_cd_iconveh (const char *src,
1027
                const iconveh_t *cd,
1028
                enum iconv_ilseq_handler handler)
1029
0
{
1030
  /* For most encodings, a trailing NUL byte in the input will be converted
1031
     to a trailing NUL byte in the output.  But not for UTF-7.  So that this
1032
     function is usable for UTF-7, we have to exclude the NUL byte from the
1033
     conversion and add it by hand afterwards.  */
1034
0
  char *result = NULL;
1035
0
  size_t length = 0;
1036
0
  int retval = mem_cd_iconveh_internal (src, strlen (src),
1037
0
                                        cd->cd, cd->cd1, cd->cd2, handler, 1,
1038
0
                                        NULL, &result, &length);
1039
1040
0
  if (retval < 0)
1041
0
    {
1042
0
      free (result);
1043
0
      return NULL;
1044
0
    }
1045
1046
  /* Add the terminating NUL byte.  */
1047
0
  result[length] = '\0';
1048
1049
0
  return result;
1050
0
}
1051
1052
#endif
1053
1054
int
1055
mem_iconveh (const char *src, size_t srclen,
1056
             const char *from_codeset, const char *to_codeset,
1057
             enum iconv_ilseq_handler handler,
1058
             size_t *offsets,
1059
             char **resultp, size_t *lengthp)
1060
0
{
1061
0
  if (srclen == 0)
1062
0
    {
1063
      /* Nothing to convert.  */
1064
0
      *lengthp = 0;
1065
0
      return 0;
1066
0
    }
1067
0
  else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1068
0
    {
1069
0
      char *result;
1070
1071
0
      if (*resultp != NULL && *lengthp >= srclen)
1072
0
        result = *resultp;
1073
0
      else
1074
0
        {
1075
0
          result = (char *) malloc (srclen);
1076
0
          if (result == NULL)
1077
0
            {
1078
0
              errno = ENOMEM;
1079
0
              return -1;
1080
0
            }
1081
0
        }
1082
0
      memcpy (result, src, srclen);
1083
0
      *resultp = result;
1084
0
      *lengthp = srclen;
1085
0
      return 0;
1086
0
    }
1087
0
  else
1088
0
    {
1089
0
#if HAVE_ICONV
1090
0
      iconveh_t cd;
1091
0
      if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1092
0
        return -1;
1093
1094
0
      char *result = *resultp;
1095
0
      size_t length = *lengthp;
1096
0
      int retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1097
0
                                   &result, &length);
1098
1099
0
      if (retval < 0)
1100
0
        {
1101
          /* Close cd, but preserve the errno from str_cd_iconv.  */
1102
0
          int saved_errno = errno;
1103
0
          iconveh_close (&cd);
1104
0
          errno = saved_errno;
1105
0
        }
1106
0
      else
1107
0
        {
1108
0
          if (iconveh_close (&cd) < 0)
1109
0
            {
1110
0
              if (result != *resultp)
1111
0
                free (result);
1112
0
              return -1;
1113
0
            }
1114
0
          *resultp = result;
1115
0
          *lengthp = length;
1116
0
        }
1117
0
      return retval;
1118
#else
1119
      /* This is a different error code than if iconv_open existed but didn't
1120
         support from_codeset and to_codeset, so that the caller can emit
1121
         an error message such as
1122
           "iconv() is not supported. Installing GNU libiconv and
1123
            then reinstalling this package would fix this."  */
1124
      errno = ENOSYS;
1125
      return -1;
1126
#endif
1127
0
    }
1128
0
}
1129
1130
char *
1131
str_iconveh (const char *src,
1132
             const char *from_codeset, const char *to_codeset,
1133
             enum iconv_ilseq_handler handler)
1134
0
{
1135
0
  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1136
0
    {
1137
0
      char *result = strdup (src);
1138
1139
0
      if (result == NULL)
1140
0
        errno = ENOMEM;
1141
0
      return result;
1142
0
    }
1143
0
  else
1144
0
    {
1145
0
#if HAVE_ICONV
1146
0
      iconveh_t cd;
1147
0
      if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1148
0
        return NULL;
1149
1150
0
      char *result = str_cd_iconveh (src, &cd, handler);
1151
1152
0
      if (result == NULL)
1153
0
        {
1154
          /* Close cd, but preserve the errno from str_cd_iconv.  */
1155
0
          int saved_errno = errno;
1156
0
          iconveh_close (&cd);
1157
0
          errno = saved_errno;
1158
0
        }
1159
0
      else
1160
0
        {
1161
0
          if (iconveh_close (&cd) < 0)
1162
0
            {
1163
0
              free (result);
1164
0
              return NULL;
1165
0
            }
1166
0
        }
1167
0
      return result;
1168
#else
1169
      /* This is a different error code than if iconv_open existed but didn't
1170
         support from_codeset and to_codeset, so that the caller can emit
1171
         an error message such as
1172
           "iconv() is not supported. Installing GNU libiconv and
1173
            then reinstalling this package would fix this."  */
1174
      errno = ENOSYS;
1175
      return NULL;
1176
#endif
1177
0
    }
1178
0
}