Coverage Report

Created: 2024-07-23 07:36

/src/libunistring/lib/striconveh.c
Line
Count
Source (jump to first uncovered line)
1
/* Character set conversion with error handling.
2
   Copyright (C) 2001-2024 Free Software Foundation, Inc.
3
   Written by Bruno Haible and Simon Josefsson.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
#include <config.h>
19
20
/* Specification.  */
21
#include "striconveh.h"
22
23
#include <errno.h>
24
#include <stdlib.h>
25
#include <string.h>
26
27
#if HAVE_ICONV
28
# include <iconv.h>
29
# include "unistr.h"
30
#endif
31
32
#include "c-strcase.h"
33
#include "c-strcaseeq.h"
34
35
#ifndef SIZE_MAX
36
# define SIZE_MAX ((size_t) -1)
37
#endif
38
39
40
#if HAVE_ICONV
41
42
/* The caller must provide an iconveh_t, not just an iconv_t, because when a
43
   conversion error occurs, we may have to determine the Unicode representation
44
   of the inconvertible character.

   iconveh_open stores three conversion descriptors in *CDP:
     - cd:  the direct conversion FROM_CODESET -> TO_CODESET.  The result of
            this iconv_open call is not checked here, so cd may be
            (iconv_t)(-1).
     - cd1: FROM_CODESET -> UTF-8, or (iconv_t)(-1) when FROM_CODESET
            matches "UTF-8" according to STRCASEEQ.
     - cd2: UTF-8 -> TO_CODESET, or (iconv_t)(-1) when TO_CODESET matches
            "UTF-8" according to STRCASEEQ (or, on the platforms selected by
            the #if inside the condition, equals "UTF-8//TRANSLIT" ignoring
            case).
   Return 0 on success.  If cd1 or cd2 cannot be opened, the descriptors
   opened so far are closed, errno keeps the value set by the failing
   iconv_open, -1 is returned, and *CDP is not written.  */
45
46
int
47
iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
48
0
{
49
0
  iconv_t cd;
50
0
  iconv_t cd1;
51
0
  iconv_t cd2;
52
53
  /* Avoid glibc-2.1 bug with EUC-KR.  */
54
# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
55
     && !defined _LIBICONV_VERSION
56
  if (c_strcasecmp (from_codeset, "EUC-KR") == 0
57
      || c_strcasecmp (to_codeset, "EUC-KR") == 0)
58
    {
59
      errno = EINVAL;
60
      return -1;
61
    }
62
# endif
63
  /* Direct descriptor.  A failure is tolerated here: it is merely recorded
     as (iconv_t)(-1) in cd and handed to the caller through *CDP.  */
64
0
  cd = iconv_open (to_codeset, from_codeset);
65
66
0
  if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
67
0
    cd1 = (iconv_t)(-1);
68
0
  else
69
0
    {
70
0
      cd1 = iconv_open ("UTF-8", from_codeset);
71
0
      if (cd1 == (iconv_t)(-1))
72
0
        {
73
0
          int saved_errno = errno;
74
0
          if (cd != (iconv_t)(-1))
75
0
            iconv_close (cd);
76
0
          errno = saved_errno;
77
0
          return -1;
78
0
        }
79
0
    }
80
81
0
  if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
82
0
# if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
83
0
      && !defined __UCLIBC__) \
84
0
     || _LIBICONV_VERSION >= 0x0105 \
85
0
     || defined ICONV_SET_TRANSLITERATE
86
0
      || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
87
0
# endif
88
0
     )
89
0
    cd2 = (iconv_t)(-1);
90
0
  else
91
0
    {
92
0
      cd2 = iconv_open (to_codeset, "UTF-8");
93
0
      if (cd2 == (iconv_t)(-1))
94
0
        {
95
0
          int saved_errno = errno;
96
0
          if (cd1 != (iconv_t)(-1))
97
0
            iconv_close (cd1);
98
0
          if (cd != (iconv_t)(-1))
99
0
            iconv_close (cd);
100
0
          errno = saved_errno;
101
0
          return -1;
102
0
        }
103
0
    }
104
105
0
  cdp->cd = cd;
106
0
  cdp->cd1 = cd1;
107
0
  cdp->cd2 = cd2;
108
0
  return 0;
109
0
}
110
111
/* Close the descriptors held in *CD in the order cd2, cd1, cd, skipping
   every member that is (iconv_t)(-1).
   Return 0 if no iconv_close call failed.  When closing cd2 or cd1 fails,
   the remaining descriptors are still closed (the results of those calls
   are ignored), errno is restored to the value set by the failing call,
   and -1 is returned.  When only the last close (cd) fails, -1 is returned
   with errno as left by that call.  */
int
112
iconveh_close (const iconveh_t *cd)
113
0
{
114
0
  if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
115
0
    {
116
      /* Return -1, but preserve the errno from iconv_close.  */
117
0
      int saved_errno = errno;
118
0
      if (cd->cd1 != (iconv_t)(-1))
119
0
        iconv_close (cd->cd1);
120
0
      if (cd->cd != (iconv_t)(-1))
121
0
        iconv_close (cd->cd);
122
0
      errno = saved_errno;
123
0
      return -1;
124
0
    }
125
0
  if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
126
0
    {
127
      /* Return -1, but preserve the errno from iconv_close.  */
128
0
      int saved_errno = errno;
129
0
      if (cd->cd != (iconv_t)(-1))
130
0
        iconv_close (cd->cd);
131
0
      errno = saved_errno;
132
0
      return -1;
133
0
    }
134
0
  if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
135
0
    return -1;
136
0
  return 0;
137
0
}
138
139
/* iconv_carefully is like iconv, except that it stops as soon as it encounters
140
   a conversion error, and it returns in *INCREMENTED a boolean telling whether
141
   it has incremented the input pointers past the error location.

   Two variants are selected by the preprocessor condition below:
     - The function variant calls iconv on short pieces of the input.  Each
       piece starts with a length of 1 byte, and the length is increased as
       long as iconv fails with EINVAL without consuming input.  A positive
       iconv result is turned into a failure: errno is set to EILSEQ,
       (size_t)(-1) is returned, and *INCREMENTED tells whether the input
       pointer moved during the last step.  In all other cases *INCREMENTED
       is false and the last iconv result is returned.  *INBUF and
       *INBYTESLEFT are always updated; *OUTBUF and *OUTBYTESLEFT are
       updated only after steps whose result is 0.
     - The macro variant sets *INCREMENTED to false and calls iconv
       directly with the given arguments.  */
142
# if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
143
     && !(defined __GLIBC__ && !defined __UCLIBC__)
144
/* Irix iconv() inserts a NUL byte if it cannot convert.
145
   NetBSD iconv() inserts a question mark if it cannot convert.
146
   Only GNU libiconv (excluding the bastard Apple iconv) and GNU libc are
147
   known to prefer to fail rather than doing a lossy conversion.  */
148
static size_t
149
iconv_carefully (iconv_t cd,
150
                 const char **inbuf, size_t *inbytesleft,
151
                 char **outbuf, size_t *outbytesleft,
152
                 bool *incremented)
153
{
154
  const char *inptr = *inbuf;
155
  const char *inptr_end = inptr + *inbytesleft;
156
  char *outptr = *outbuf;
157
  size_t outsize = *outbytesleft;
158
  const char *inptr_before;
159
  size_t res;
160
161
  do
162
    {
163
      size_t insize;
164
165
      inptr_before = inptr;
166
      res = (size_t)(-1);
167
168
      for (insize = 1; inptr + insize <= inptr_end; insize++)
169
        {
170
          res = iconv (cd,
171
                       (ICONV_CONST char **) &inptr, &insize,
172
                       &outptr, &outsize);
173
          if (!(res == (size_t)(-1) && errno == EINVAL))
174
            break;
175
          /* iconv can eat up a shift sequence but give EINVAL while attempting
176
             to convert the first character.  E.g. libiconv does this.  */
177
          if (inptr > inptr_before)
178
            {
179
              res = 0;
180
              break;
181
            }
182
        }
183
      /* Publish the output position only after a step that returned 0.  */
184
      if (res == 0)
185
        {
186
          *outbuf = outptr;
187
          *outbytesleft = outsize;
188
        }
189
    }
190
  while (res == 0 && inptr < inptr_end);
191
192
  *inbuf = inptr;
193
  *inbytesleft = inptr_end - inptr;
194
  if (res != (size_t)(-1) && res > 0)
195
    {
196
      /* iconv() has already incremented INPTR.  We cannot go back to a
197
         previous INPTR, otherwise the state inside CD would become invalid,
198
         if FROM_CODESET is a stateful encoding.  So, tell the caller that
199
         *INBUF has already been incremented.  */
200
      *incremented = (inptr > inptr_before);
201
      errno = EILSEQ;
202
      return (size_t)(-1);
203
    }
204
  else
205
    {
206
      *incremented = false;
207
      return res;
208
    }
209
}
210
# else
211
#  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
212
0
     (*(incremented) = false, \
213
0
      iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
214
# endif
215
216
/* iconv_carefully_1 is like iconv_carefully, except that it stops after
217
   converting one character or one shift sequence.

   It calls iconv from the same start position with input lengths 1, 2, ...
   until iconv returns something other than an EINVAL failure, or until
   iconv has consumed input despite reporting EINVAL (a shift sequence),
   which is then reported as the result 0.
   *INBUF and *INBYTESLEFT are always updated.  *OUTBUF and *OUTBYTESLEFT
   are updated only when the result is not (size_t)(-1).  *INCREMENTED is
   set to false, except in the conditionally compiled branch below that
   turns a positive iconv result into an EILSEQ failure.
   If *INBYTESLEFT is 0 on entry, the loop body never runs and (size_t)(-1)
   is returned without errno being assigned by this function.  */
218
static size_t
219
iconv_carefully_1 (iconv_t cd,
220
                   const char **inbuf, size_t *inbytesleft,
221
                   char **outbuf, size_t *outbytesleft,
222
                   bool *incremented)
223
0
{
224
0
  const char *inptr_before = *inbuf;
225
0
  const char *inptr = inptr_before;
226
0
  const char *inptr_end = inptr_before + *inbytesleft;
227
0
  char *outptr = *outbuf;
228
0
  size_t outsize = *outbytesleft;
229
0
  size_t res = (size_t)(-1);
230
0
  size_t insize;
231
232
0
  for (insize = 1; inptr_before + insize <= inptr_end; insize++)
233
0
    {
234
0
      inptr = inptr_before;
235
0
      res = iconv (cd,
236
0
                   (ICONV_CONST char **) &inptr, &insize,
237
0
                   &outptr, &outsize);
238
0
      if (!(res == (size_t)(-1) && errno == EINVAL))
239
0
        break;
240
      /* iconv can eat up a shift sequence but give EINVAL while attempting
241
         to convert the first character.  E.g. libiconv does this.  */
242
0
      if (inptr > inptr_before)
243
0
        {
244
0
          res = 0;
245
0
          break;
246
0
        }
247
0
    }
248
249
0
  *inbuf = inptr;
250
0
  *inbytesleft = inptr_end - inptr;
251
# if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
252
     && !(defined __GLIBC__ && !defined __UCLIBC__)
253
  /* Irix iconv() inserts a NUL byte if it cannot convert.
254
     NetBSD iconv() inserts a question mark if it cannot convert.
255
     Only GNU libiconv (excluding the bastard Apple iconv) and GNU libc are
256
     known to prefer to fail rather than doing a lossy conversion.  */
257
  if (res != (size_t)(-1) && res > 0)
258
    {
259
      /* iconv() has already incremented INPTR.  We cannot go back to a
260
         previous INPTR, otherwise the state inside CD would become invalid,
261
         if FROM_CODESET is a stateful encoding.  So, tell the caller that
262
         *INBUF has already been incremented.  */
263
      *incremented = (inptr > inptr_before);
264
      errno = EILSEQ;
265
      return (size_t)(-1);
266
    }
267
# endif
268
  /* Commit the output position unless iconv reported a failure.  */
269
0
  if (res != (size_t)(-1))
270
0
    {
271
0
      *outbuf = outptr;
272
0
      *outbytesleft = outsize;
273
0
    }
274
0
  *incremented = false;
275
0
  return res;
276
0
}
277
278
/* utf8conv_carefully is like iconv, except that
279
     - it converts from UTF-8 to UTF-8,
280
     - it stops as soon as it encounters a conversion error, and it returns
281
       in *INCREMENTED a boolean telling whether it has incremented the input
282
       pointers past the error location,
283
     - if one_character_only is true, it stops after converting one
284
       character.

   Return 0 on success, or (size_t)(-1) with errno set as follows:
     - EINVAL (when u8_mbtoucr returned -2) or EILSEQ (any other negative
       result): the input is advanced by the return value of u8_mbtouc and
       *INCREMENTED is true;
     - E2BIG when no output space is left or u8_uctomb returns -2: the
       character is not consumed and *INCREMENTED is false;
     - EILSEQ when u8_uctomb returns -1: the character has been consumed
       and *INCREMENTED is true.
   *INBUF, *INBYTESLEFT, *OUTBUF and *OUTBYTESLEFT are written back in every
   case.  *INCREMENTED is assigned only on the error paths.  */
285
static size_t
286
utf8conv_carefully (bool one_character_only,
287
                    const char **inbuf, size_t *inbytesleft,
288
                    char **outbuf, size_t *outbytesleft,
289
                    bool *incremented)
290
0
{
291
0
  const char *inptr = *inbuf;
292
0
  size_t insize = *inbytesleft;
293
0
  char *outptr = *outbuf;
294
0
  size_t outsize = *outbytesleft;
295
0
  size_t res;
296
297
0
  res = 0;
298
0
  do
299
0
    {
300
0
      ucs4_t uc;
301
0
      int n;
302
0
      int m;
303
304
0
      n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
305
0
      if (n < 0)
306
0
        {
307
0
          errno = (n == -2 ? EINVAL : EILSEQ);
          /* Use the return value of u8_mbtouc as the number of input bytes
             to skip.  */
308
0
          n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
309
0
          inptr += n;
310
0
          insize -= n;
311
0
          res = (size_t)(-1);
312
0
          *incremented = true;
313
0
          break;
314
0
        }
315
0
      if (outsize == 0)
316
0
        {
317
0
          errno = E2BIG;
318
0
          res = (size_t)(-1);
319
0
          *incremented = false;
320
0
          break;
321
0
        }
322
0
      m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
323
0
      if (m == -2)
324
0
        {
325
0
          errno = E2BIG;
326
0
          res = (size_t)(-1);
327
0
          *incremented = false;
328
0
          break;
329
0
        }
330
0
      inptr += n;
331
0
      insize -= n;
332
0
      if (m == -1)
333
0
        {
334
0
          errno = EILSEQ;
335
0
          res = (size_t)(-1);
336
0
          *incremented = true;
337
0
          break;
338
0
        }
339
0
      outptr += m;
340
0
      outsize -= m;
341
0
    }
342
0
  while (!one_character_only && insize > 0);
343
344
0
  *inbuf = inptr;
345
0
  *inbytesleft = insize;
346
0
  *outbuf = outptr;
347
0
  *outbytesleft = outsize;
348
0
  return res;
349
0
}
350
351
static int
352
mem_cd_iconveh_internal (const char *src, size_t srclen,
353
                         iconv_t cd, iconv_t cd1, iconv_t cd2,
354
                         enum iconv_ilseq_handler handler,
355
                         size_t extra_alloc,
356
                         size_t *offsets,
357
                         char **resultp, size_t *lengthp)
358
0
{
359
  /* When a conversion error occurs, we cannot start using CD1 and CD2 at
360
     this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
361
     Instead, we have to start afresh from the beginning of SRC.  */
362
  /* Use a temporary buffer, so that for small strings, a single malloc()
363
     call will be sufficient.  */
364
0
# define tmpbufsize 4096
365
  /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
366
     libiconv's UCS-4-INTERNAL encoding.  */
367
0
  union { unsigned int align; char buf[tmpbufsize]; } tmp;
368
0
# define tmpbuf tmp.buf
369
370
0
  char *initial_result;
371
0
  char *result;
372
0
  size_t allocated;
373
0
  size_t length;
374
0
  size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
375
376
0
  if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
377
0
    {
378
0
      initial_result = *resultp;
379
0
      allocated = *lengthp;
380
0
    }
381
0
  else
382
0
    {
383
0
      initial_result = tmpbuf;
384
0
      allocated = sizeof (tmpbuf);
385
0
    }
386
0
  result = initial_result;
387
388
  /* Test whether a direct conversion is possible at all.  */
389
0
  if (cd == (iconv_t)(-1))
390
0
    goto indirectly;
391
392
0
  if (offsets != NULL)
393
0
    {
394
0
      size_t i;
395
396
0
      for (i = 0; i < srclen; i++)
397
0
        offsets[i] = (size_t)(-1);
398
399
0
      last_length = (size_t)(-1);
400
0
    }
401
0
  length = 0;
402
403
  /* First, try a direct conversion, and see whether a conversion error
404
     occurs at all.  */
405
0
  {
406
0
    const char *inptr = src;
407
0
    size_t insize = srclen;
408
409
    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
410
0
# if defined _LIBICONV_VERSION \
411
0
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
412
0
          || defined __sun)
413
    /* Set to the initial state.  */
414
0
    iconv (cd, NULL, NULL, NULL, NULL);
415
0
# endif
416
417
0
    while (insize > 0)
418
0
      {
419
0
        char *outptr = result + length;
420
0
        size_t outsize = allocated - extra_alloc - length;
421
0
        bool incremented;
422
0
        size_t res;
423
0
        bool grow;
424
425
0
        if (offsets != NULL)
426
0
          {
427
0
            if (length != last_length) /* ensure that offset[] be increasing */
428
0
              {
429
0
                offsets[inptr - src] = length;
430
0
                last_length = length;
431
0
              }
432
0
            res = iconv_carefully_1 (cd,
433
0
                                     &inptr, &insize,
434
0
                                     &outptr, &outsize,
435
0
                                     &incremented);
436
0
          }
437
0
        else
438
          /* Use iconv_carefully instead of iconv here, because:
439
             - If TO_CODESET is UTF-8, we can do the error handling in this
440
               loop, no need for a second loop,
441
             - With iconv() implementations other than GNU libiconv and GNU
442
               libc, if we use iconv() in a big swoop, checking for an E2BIG
443
               return, we lose the number of irreversible conversions.  */
444
0
          res = iconv_carefully (cd,
445
0
                                 &inptr, &insize,
446
0
                                 &outptr, &outsize,
447
0
                                 &incremented);
448
449
0
        length = outptr - result;
450
0
        grow = (length + extra_alloc > allocated / 2);
451
0
        if (res == (size_t)(-1))
452
0
          {
453
0
            if (errno == E2BIG)
454
0
              grow = true;
455
0
            else if (errno == EINVAL)
456
0
              break;
457
0
            else if (errno == EILSEQ && handler != iconveh_error)
458
0
              {
459
0
                if (cd2 == (iconv_t)(-1))
460
0
                  {
461
                    /* TO_CODESET is UTF-8.  */
462
                    /* Error handling can produce up to 1 or 3 bytes of
463
                       output.  */
464
0
                    size_t extra_need =
465
0
                      (handler == iconveh_replacement_character ? 3 : 1);
466
0
                    if (length + extra_need + extra_alloc > allocated)
467
0
                      {
468
0
                        char *memory;
469
470
0
                        allocated = 2 * allocated;
471
0
                        if (length + extra_need + extra_alloc > allocated)
472
0
                          allocated = 2 * allocated;
473
0
                        if (length + extra_need + extra_alloc > allocated)
474
0
                          abort ();
475
0
                        if (result == initial_result)
476
0
                          memory = (char *) malloc (allocated);
477
0
                        else
478
0
                          memory = (char *) realloc (result, allocated);
479
0
                        if (memory == NULL)
480
0
                          {
481
0
                            if (result != initial_result)
482
0
                              free (result);
483
0
                            errno = ENOMEM;
484
0
                            return -1;
485
0
                          }
486
0
                        if (result == initial_result)
487
0
                          memcpy (memory, initial_result, length);
488
0
                        result = memory;
489
0
                        grow = false;
490
0
                      }
491
                    /* The input is invalid in FROM_CODESET.  Eat up one byte
492
                       and emit a replacement character or a question mark.  */
493
0
                    if (!incremented)
494
0
                      {
495
0
                        if (insize == 0)
496
0
                          abort ();
497
0
                        inptr++;
498
0
                        insize--;
499
0
                      }
500
0
                    if (handler == iconveh_replacement_character)
501
0
                      {
502
                        /* U+FFFD in UTF-8 encoding.  */
503
0
                        result[length+0] = '\357';
504
0
                        result[length+1] = '\277';
505
0
                        result[length+2] = '\275';
506
0
                        length += 3;
507
0
                      }
508
0
                    else
509
0
                      {
510
0
                        result[length] = '?';
511
0
                        length++;
512
0
                      }
513
0
                  }
514
0
                else
515
0
                  goto indirectly;
516
0
              }
517
0
            else
518
0
              {
519
0
                if (result != initial_result)
520
0
                  free (result);
521
0
                return -1;
522
0
              }
523
0
          }
524
0
        if (insize == 0)
525
0
          break;
526
0
        if (grow)
527
0
          {
528
0
            char *memory;
529
530
0
            allocated = 2 * allocated;
531
0
            if (result == initial_result)
532
0
              memory = (char *) malloc (allocated);
533
0
            else
534
0
              memory = (char *) realloc (result, allocated);
535
0
            if (memory == NULL)
536
0
              {
537
0
                if (result != initial_result)
538
0
                  free (result);
539
0
                errno = ENOMEM;
540
0
                return -1;
541
0
              }
542
0
            if (result == initial_result)
543
0
              memcpy (memory, initial_result, length);
544
0
            result = memory;
545
0
          }
546
0
      }
547
0
  }
548
549
  /* Now get the conversion state back to the initial state.
550
     But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
551
0
#if defined _LIBICONV_VERSION \
552
0
    || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
553
0
         || defined __sun)
554
0
  for (;;)
555
0
    {
556
0
      char *outptr = result + length;
557
0
      size_t outsize = allocated - extra_alloc - length;
558
0
      size_t res;
559
560
0
      res = iconv (cd, NULL, NULL, &outptr, &outsize);
561
0
      length = outptr - result;
562
0
      if (res == (size_t)(-1))
563
0
        {
564
0
          if (errno == E2BIG)
565
0
            {
566
0
              char *memory;
567
568
0
              allocated = 2 * allocated;
569
0
              if (result == initial_result)
570
0
                memory = (char *) malloc (allocated);
571
0
              else
572
0
                memory = (char *) realloc (result, allocated);
573
0
              if (memory == NULL)
574
0
                {
575
0
                  if (result != initial_result)
576
0
                    free (result);
577
0
                  errno = ENOMEM;
578
0
                  return -1;
579
0
                }
580
0
              if (result == initial_result)
581
0
                memcpy (memory, initial_result, length);
582
0
              result = memory;
583
0
            }
584
0
          else
585
0
            {
586
0
              if (result != initial_result)
587
0
                free (result);
588
0
              return -1;
589
0
            }
590
0
        }
591
0
      else
592
0
        break;
593
0
    }
594
0
#endif
595
596
  /* The direct conversion succeeded.  */
597
0
  goto done;
598
599
0
 indirectly:
600
  /* The direct conversion failed.
601
     Use a conversion through UTF-8.  */
602
0
  if (offsets != NULL)
603
0
    {
604
0
      size_t i;
605
606
0
      for (i = 0; i < srclen; i++)
607
0
        offsets[i] = (size_t)(-1);
608
609
0
      last_length = (size_t)(-1);
610
0
    }
611
0
  length = 0;
612
0
  {
613
0
    const bool slowly = (offsets != NULL || handler == iconveh_error);
614
0
# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
615
0
    char utf8buf[utf8bufsize + 3];
616
0
    size_t utf8len = 0;
617
0
    const char *in1ptr = src;
618
0
    size_t in1size = srclen;
619
0
    bool do_final_flush1 = true;
620
0
    bool do_final_flush2 = true;
621
622
    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
623
0
# if defined _LIBICONV_VERSION \
624
0
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
625
0
          || defined __sun)
626
    /* Set to the initial state.  */
627
0
    if (cd1 != (iconv_t)(-1))
628
0
      iconv (cd1, NULL, NULL, NULL, NULL);
629
0
    if (cd2 != (iconv_t)(-1))
630
0
      iconv (cd2, NULL, NULL, NULL, NULL);
631
0
# endif
632
633
0
    while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
634
0
      {
635
0
        char *out1ptr = utf8buf + utf8len;
636
0
        size_t out1size = utf8bufsize - utf8len;
637
0
        bool incremented1;
638
0
        size_t res1;
639
0
        int errno1;
640
641
        /* Conversion step 1: from FROM_CODESET to UTF-8.  */
642
0
        if (in1size > 0)
643
0
          {
644
0
            if (offsets != NULL
645
0
                && length != last_length) /* ensure that offset[] be increasing */
646
0
              {
647
0
                offsets[in1ptr - src] = length;
648
0
                last_length = length;
649
0
              }
650
0
            if (cd1 != (iconv_t)(-1))
651
0
              {
652
0
                if (slowly)
653
0
                  res1 = iconv_carefully_1 (cd1,
654
0
                                            &in1ptr, &in1size,
655
0
                                            &out1ptr, &out1size,
656
0
                                            &incremented1);
657
0
                else
658
0
                  res1 = iconv_carefully (cd1,
659
0
                                          &in1ptr, &in1size,
660
0
                                          &out1ptr, &out1size,
661
0
                                          &incremented1);
662
0
              }
663
0
            else
664
0
              {
665
                /* FROM_CODESET is UTF-8.  */
666
0
                res1 = utf8conv_carefully (slowly,
667
0
                                           &in1ptr, &in1size,
668
0
                                           &out1ptr, &out1size,
669
0
                                           &incremented1);
670
0
              }
671
0
          }
672
0
        else if (do_final_flush1)
673
0
          {
674
            /* Now get the conversion state of CD1 back to the initial state.
675
               But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
676
0
# if defined _LIBICONV_VERSION \
677
0
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
678
0
          || defined __sun)
679
0
            if (cd1 != (iconv_t)(-1))
680
0
              res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
681
0
            else
682
0
# endif
683
0
              res1 = 0;
684
0
            do_final_flush1 = false;
685
0
            incremented1 = true;
686
0
          }
687
0
        else
688
0
          {
689
0
            res1 = 0;
690
0
            incremented1 = true;
691
0
          }
692
0
        if (res1 == (size_t)(-1)
693
0
            && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
694
0
          {
695
0
            if (result != initial_result)
696
0
              free (result);
697
0
            return -1;
698
0
          }
699
0
        if (res1 == (size_t)(-1)
700
0
            && errno == EILSEQ && handler != iconveh_error)
701
0
          {
702
            /* The input is invalid in FROM_CODESET.  Eat up one byte and
703
               emit a U+FFFD character or a question mark.  Room for this
704
               character was allocated at the end of utf8buf.  */
705
0
            if (!incremented1)
706
0
              {
707
0
                if (in1size == 0)
708
0
                  abort ();
709
0
                in1ptr++;
710
0
                in1size--;
711
0
              }
712
0
            if (handler == iconveh_replacement_character)
713
0
              {
714
                /* U+FFFD in UTF-8 encoding.  */
715
0
                out1ptr[0] = '\357';
716
0
                out1ptr[1] = '\277';
717
0
                out1ptr[2] = '\275';
718
0
                out1ptr += 3;
719
0
              }
720
0
            else
721
0
              *out1ptr++ = '?';
722
0
            res1 = 0;
723
0
          }
724
0
        errno1 = errno;
725
0
        utf8len = out1ptr - utf8buf;
726
727
0
        if (offsets != NULL
728
0
            || in1size == 0
729
0
            || utf8len > utf8bufsize / 2
730
0
            || (res1 == (size_t)(-1) && errno1 == E2BIG))
731
0
          {
732
            /* Conversion step 2: from UTF-8 to TO_CODESET.  */
733
0
            const char *in2ptr = utf8buf;
734
0
            size_t in2size = utf8len;
735
736
0
            while (in2size > 0
737
0
                   || (in1size == 0 && !do_final_flush1 && do_final_flush2))
738
0
              {
739
0
                char *out2ptr = result + length;
740
0
                size_t out2size = allocated - extra_alloc - length;
741
0
                bool incremented2;
742
0
                size_t res2;
743
0
                bool grow;
744
745
0
                if (in2size > 0)
746
0
                  {
747
0
                    if (cd2 != (iconv_t)(-1))
748
0
                      res2 = iconv_carefully (cd2,
749
0
                                              &in2ptr, &in2size,
750
0
                                              &out2ptr, &out2size,
751
0
                                              &incremented2);
752
0
                    else
753
                      /* TO_CODESET is UTF-8.  */
754
0
                      res2 = utf8conv_carefully (false,
755
0
                                                 &in2ptr, &in2size,
756
0
                                                 &out2ptr, &out2size,
757
0
                                                 &incremented2);
758
0
                  }
759
0
                else /* in1size == 0 && !do_final_flush1
760
                        && in2size == 0 && do_final_flush2 */
761
0
                  {
762
                    /* Now get the conversion state of CD2 back to the initial
763
                       state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
764
0
# if defined _LIBICONV_VERSION \
765
0
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
766
0
          || defined __sun)
767
0
                    if (cd2 != (iconv_t)(-1))
768
0
                      res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
769
0
                    else
770
0
# endif
771
0
                      res2 = 0;
772
0
                    do_final_flush2 = false;
773
0
                    incremented2 = true;
774
0
                  }
775
776
0
                length = out2ptr - result;
777
0
                grow = (length + extra_alloc > allocated / 2);
778
0
                if (res2 == (size_t)(-1))
779
0
                  {
780
0
                    if (errno == E2BIG)
781
0
                      grow = true;
782
0
                    else if (errno == EINVAL)
783
0
                      break;
784
0
                    else if (errno == EILSEQ && handler != iconveh_error)
785
0
                      {
786
                        /* Error handling can produce up to 10 bytes of UTF-8
787
                           output.  But TO_CODESET may be UCS-2, UTF-16 or
788
                           UCS-4, so use CD2 here as well.  */
789
0
                        char scratchbuf[10];
790
0
                        size_t scratchlen;
791
0
                        ucs4_t uc;
792
0
                        const char *inptr;
793
0
                        size_t insize;
794
0
                        size_t res;
795
796
0
                        if (incremented2)
797
0
                          {
798
0
                            if (u8_prev (&uc, (const uint8_t *) in2ptr,
799
0
                                         (const uint8_t *) utf8buf)
800
0
                                == NULL)
801
0
                              abort ();
802
0
                          }
803
0
                        else
804
0
                          {
805
0
                            int n;
806
0
                            if (in2size == 0)
807
0
                              abort ();
808
0
                            n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
809
0
                                                  in2size);
810
0
                            in2ptr += n;
811
0
                            in2size -= n;
812
0
                          }
813
814
0
                        if (handler == iconveh_escape_sequence)
815
0
                          {
816
0
                            static char const hex[16] = "0123456789ABCDEF";
817
0
                            scratchlen = 0;
818
0
                            scratchbuf[scratchlen++] = '\\';
819
0
                            if (uc < 0x10000)
820
0
                              scratchbuf[scratchlen++] = 'u';
821
0
                            else
822
0
                              {
823
0
                                scratchbuf[scratchlen++] = 'U';
824
0
                                scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
825
0
                                scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
826
0
                                scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
827
0
                                scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
828
0
                              }
829
0
                            scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
830
0
                            scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
831
0
                            scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
832
0
                            scratchbuf[scratchlen++] = hex[uc & 15];
833
0
                          }
834
0
                        else if (handler == iconveh_replacement_character)
835
0
                          {
836
                            /* U+FFFD in UTF-8 encoding.  */
837
0
                            scratchbuf[0] = '\357';
838
0
                            scratchbuf[1] = '\277';
839
0
                            scratchbuf[2] = '\275';
840
0
                            scratchlen = 3;
841
0
                          }
842
0
                        else
843
0
                          {
844
0
                            scratchbuf[0] = '?';
845
0
                            scratchlen = 1;
846
0
                          }
847
848
0
                        inptr = scratchbuf;
849
0
                        insize = scratchlen;
850
0
                        if (cd2 != (iconv_t)(-1))
851
0
                          {
852
0
                            char *out2ptr_try = out2ptr;
853
0
                            size_t out2size_try = out2size;
854
0
                            res = iconv (cd2,
855
0
                                         (ICONV_CONST char **) &inptr, &insize,
856
0
                                         &out2ptr_try, &out2size_try);
857
0
                            if (handler == iconveh_replacement_character
858
0
                                && (res == (size_t)(-1)
859
0
                                    ? errno == EILSEQ
860
                                    /* FreeBSD iconv(), NetBSD iconv(), and
861
                                       Solaris 11 iconv() insert a '?' if they
862
                                       cannot convert.  This is what we want.
863
                                       But IRIX iconv() inserts a NUL byte if it
864
                                       cannot convert.
865
                                       And musl libc iconv() inserts a '*' if it
866
                                       cannot convert.  */
867
0
                                    : (res > 0
868
0
                                       && !(out2ptr_try - out2ptr == 1
869
0
                                            && *out2ptr == '?'))))
870
0
                              {
871
                                /* The iconv() call failed.
872
                                   U+FFFD can't be converted to TO_CODESET.
873
                                   Use '?' instead.  */
874
0
                                scratchbuf[0] = '?';
875
0
                                scratchlen = 1;
876
0
                                inptr = scratchbuf;
877
0
                                insize = scratchlen;
878
0
                                res = iconv (cd2,
879
0
                                             (ICONV_CONST char **) &inptr, &insize,
880
0
                                             &out2ptr, &out2size);
881
0
                              }
882
0
                            else
883
0
                              {
884
                                /* Accept the results of the iconv() call.  */
885
0
                                out2ptr = out2ptr_try;
886
0
                                out2size = out2size_try;
887
0
                                res = 0;
888
0
                              }
889
0
                          }
890
0
                        else
891
0
                          {
892
                            /* TO_CODESET is UTF-8.  */
893
0
                            if (out2size >= insize)
894
0
                              {
895
0
                                memcpy (out2ptr, inptr, insize);
896
0
                                out2ptr += insize;
897
0
                                out2size -= insize;
898
0
                                inptr += insize;
899
0
                                insize = 0;
900
0
                                res = 0;
901
0
                              }
902
0
                            else
903
0
                              {
904
0
                                errno = E2BIG;
905
0
                                res = (size_t)(-1);
906
0
                              }
907
0
                          }
908
0
                        length = out2ptr - result;
909
0
                        if (res == (size_t)(-1) && errno == E2BIG)
910
0
                          {
911
0
                            char *memory;
912
913
0
                            allocated = 2 * allocated;
914
0
                            if (length + 1 + extra_alloc > allocated)
915
0
                              abort ();
916
0
                            if (result == initial_result)
917
0
                              memory = (char *) malloc (allocated);
918
0
                            else
919
0
                              memory = (char *) realloc (result, allocated);
920
0
                            if (memory == NULL)
921
0
                              {
922
0
                                if (result != initial_result)
923
0
                                  free (result);
924
0
                                errno = ENOMEM;
925
0
                                return -1;
926
0
                              }
927
0
                            if (result == initial_result)
928
0
                              memcpy (memory, initial_result, length);
929
0
                            result = memory;
930
0
                            grow = false;
931
932
0
                            out2ptr = result + length;
933
0
                            out2size = allocated - extra_alloc - length;
934
0
                            if (cd2 != (iconv_t)(-1))
935
0
                              res = iconv (cd2,
936
0
                                           (ICONV_CONST char **) &inptr,
937
0
                                           &insize,
938
0
                                           &out2ptr, &out2size);
939
0
                            else
940
0
                              {
941
                                /* TO_CODESET is UTF-8.  */
942
0
                                if (!(out2size >= insize))
943
0
                                  abort ();
944
0
                                memcpy (out2ptr, inptr, insize);
945
0
                                out2ptr += insize;
946
0
                                out2size -= insize;
947
0
                                inptr += insize;
948
0
                                insize = 0;
949
0
                                res = 0;
950
0
                              }
951
0
                            length = out2ptr - result;
952
0
                          }
953
# if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
954
     && !(defined __GLIBC__ && !defined __UCLIBC__)
955
                        /* IRIX iconv() inserts a NUL byte if it cannot convert.
956
                           FreeBSD iconv(), NetBSD iconv(), and Solaris 11
957
                           iconv() insert a '?' if they cannot convert.
958
                           musl libc iconv() inserts a '*' if it cannot convert.
959
                           Only GNU libiconv (excluding the bastard Apple iconv)
960
                           and GNU libc are known to prefer to fail rather than
961
                           doing a lossy conversion.  */
962
                        if (res != (size_t)(-1) && res > 0)
963
                          {
964
                            errno = EILSEQ;
965
                            res = (size_t)(-1);
966
                          }
967
# endif
968
0
                        if (res == (size_t)(-1))
969
0
                          {
970
                            /* Failure converting the ASCII replacement.  */
971
0
                            if (result != initial_result)
972
0
                              free (result);
973
0
                            return -1;
974
0
                          }
975
0
                      }
976
0
                    else
977
0
                      {
978
0
                        if (result != initial_result)
979
0
                          free (result);
980
0
                        return -1;
981
0
                      }
982
0
                  }
983
0
                if (!(in2size > 0
984
0
                      || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
985
0
                  break;
986
0
                if (grow)
987
0
                  {
988
0
                    char *memory;
989
990
0
                    allocated = 2 * allocated;
991
0
                    if (result == initial_result)
992
0
                      memory = (char *) malloc (allocated);
993
0
                    else
994
0
                      memory = (char *) realloc (result, allocated);
995
0
                    if (memory == NULL)
996
0
                      {
997
0
                        if (result != initial_result)
998
0
                          free (result);
999
0
                        errno = ENOMEM;
1000
0
                        return -1;
1001
0
                      }
1002
0
                    if (result == initial_result)
1003
0
                      memcpy (memory, initial_result, length);
1004
0
                    result = memory;
1005
0
                  }
1006
0
              }
1007
1008
            /* Move the remaining bytes to the beginning of utf8buf.  */
1009
0
            if (in2size > 0)
1010
0
              memmove (utf8buf, in2ptr, in2size);
1011
0
            utf8len = in2size;
1012
0
          }
1013
1014
0
        if (res1 == (size_t)(-1))
1015
0
          {
1016
0
            if (errno1 == EINVAL)
1017
0
              in1size = 0;
1018
0
            else if (errno1 == EILSEQ)
1019
0
              {
1020
0
                if (result != initial_result)
1021
0
                  free (result);
1022
0
                errno = errno1;
1023
0
                return -1;
1024
0
              }
1025
0
          }
1026
0
      }
1027
0
# undef utf8bufsize
1028
0
  }
1029
1030
0
 done:
1031
  /* Now the final memory allocation.  */
1032
0
  if (result == tmpbuf)
1033
0
    {
1034
0
      size_t memsize = length + extra_alloc;
1035
1036
0
      if (*resultp != NULL && *lengthp >= memsize)
1037
0
        result = *resultp;
1038
0
      else
1039
0
        {
1040
0
          char *memory;
1041
1042
0
          memory = (char *) malloc (memsize > 0 ? memsize : 1);
1043
0
          if (memory != NULL)
1044
0
            result = memory;
1045
0
          else
1046
0
            {
1047
0
              errno = ENOMEM;
1048
0
              return -1;
1049
0
            }
1050
0
        }
1051
0
      memcpy (result, tmpbuf, length);
1052
0
    }
1053
0
  else if (result != *resultp && length + extra_alloc < allocated)
1054
0
    {
1055
      /* Shrink the allocated memory if possible.  */
1056
0
      size_t memsize = length + extra_alloc;
1057
0
      char *memory;
1058
1059
0
      memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
1060
0
      if (memory != NULL)
1061
0
        result = memory;
1062
0
    }
1063
0
  *resultp = result;
1064
0
  *lengthp = length;
1065
0
  return 0;
1066
0
# undef tmpbuf
1067
0
# undef tmpbufsize
1068
0
}
1069
1070
int
1071
mem_cd_iconveh (const char *src, size_t srclen,
1072
                const iconveh_t *cd,
1073
                enum iconv_ilseq_handler handler,
1074
                size_t *offsets,
1075
                char **resultp, size_t *lengthp)
1076
0
{
1077
0
  return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1078
0
                                  handler, 0, offsets, resultp, lengthp);
1079
0
}
1080
1081
char *
1082
str_cd_iconveh (const char *src,
1083
                const iconveh_t *cd,
1084
                enum iconv_ilseq_handler handler)
1085
0
{
1086
  /* For most encodings, a trailing NUL byte in the input will be converted
1087
     to a trailing NUL byte in the output.  But not for UTF-7.  So that this
1088
     function is usable for UTF-7, we have to exclude the NUL byte from the
1089
     conversion and add it by hand afterwards.  */
1090
0
  char *result = NULL;
1091
0
  size_t length = 0;
1092
0
  int retval = mem_cd_iconveh_internal (src, strlen (src),
1093
0
                                        cd->cd, cd->cd1, cd->cd2, handler, 1,
1094
0
                                        NULL, &result, &length);
1095
1096
0
  if (retval < 0)
1097
0
    {
1098
0
      free (result);
1099
0
      return NULL;
1100
0
    }
1101
1102
  /* Add the terminating NUL byte.  */
1103
0
  result[length] = '\0';
1104
1105
0
  return result;
1106
0
}
1107
1108
#endif
1109
1110
int
1111
mem_iconveh (const char *src, size_t srclen,
1112
             const char *from_codeset, const char *to_codeset,
1113
             enum iconv_ilseq_handler handler,
1114
             size_t *offsets,
1115
             char **resultp, size_t *lengthp)
1116
0
{
1117
0
  if (srclen == 0)
1118
0
    {
1119
      /* Nothing to convert.  */
1120
0
      *lengthp = 0;
1121
0
      return 0;
1122
0
    }
1123
0
  else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1124
0
    {
1125
0
      char *result;
1126
1127
0
      if (*resultp != NULL && *lengthp >= srclen)
1128
0
        result = *resultp;
1129
0
      else
1130
0
        {
1131
0
          result = (char *) malloc (srclen);
1132
0
          if (result == NULL)
1133
0
            {
1134
0
              errno = ENOMEM;
1135
0
              return -1;
1136
0
            }
1137
0
        }
1138
0
      memcpy (result, src, srclen);
1139
0
      *resultp = result;
1140
0
      *lengthp = srclen;
1141
0
      return 0;
1142
0
    }
1143
0
  else
1144
0
    {
1145
0
#if HAVE_ICONV
1146
0
      iconveh_t cd;
1147
0
      char *result;
1148
0
      size_t length;
1149
0
      int retval;
1150
1151
0
      if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1152
0
        return -1;
1153
1154
0
      result = *resultp;
1155
0
      length = *lengthp;
1156
0
      retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1157
0
                               &result, &length);
1158
1159
0
      if (retval < 0)
1160
0
        {
1161
          /* Close cd, but preserve the errno from str_cd_iconv.  */
1162
0
          int saved_errno = errno;
1163
0
          iconveh_close (&cd);
1164
0
          errno = saved_errno;
1165
0
        }
1166
0
      else
1167
0
        {
1168
0
          if (iconveh_close (&cd) < 0)
1169
0
            {
1170
0
              if (result != *resultp)
1171
0
                free (result);
1172
0
              return -1;
1173
0
            }
1174
0
          *resultp = result;
1175
0
          *lengthp = length;
1176
0
        }
1177
0
      return retval;
1178
#else
1179
      /* This is a different error code than if iconv_open existed but didn't
1180
         support from_codeset and to_codeset, so that the caller can emit
1181
         an error message such as
1182
           "iconv() is not supported. Installing GNU libiconv and
1183
            then reinstalling this package would fix this."  */
1184
      errno = ENOSYS;
1185
      return -1;
1186
#endif
1187
0
    }
1188
0
}
1189
1190
char *
1191
str_iconveh (const char *src,
1192
             const char *from_codeset, const char *to_codeset,
1193
             enum iconv_ilseq_handler handler)
1194
0
{
1195
0
  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1196
0
    {
1197
0
      char *result = strdup (src);
1198
1199
0
      if (result == NULL)
1200
0
        errno = ENOMEM;
1201
0
      return result;
1202
0
    }
1203
0
  else
1204
0
    {
1205
0
#if HAVE_ICONV
1206
0
      iconveh_t cd;
1207
0
      char *result;
1208
1209
0
      if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1210
0
        return NULL;
1211
1212
0
      result = str_cd_iconveh (src, &cd, handler);
1213
1214
0
      if (result == NULL)
1215
0
        {
1216
          /* Close cd, but preserve the errno from str_cd_iconv.  */
1217
0
          int saved_errno = errno;
1218
0
          iconveh_close (&cd);
1219
0
          errno = saved_errno;
1220
0
        }
1221
0
      else
1222
0
        {
1223
0
          if (iconveh_close (&cd) < 0)
1224
0
            {
1225
0
              free (result);
1226
0
              return NULL;
1227
0
            }
1228
0
        }
1229
0
      return result;
1230
#else
1231
      /* This is a different error code than if iconv_open existed but didn't
1232
         support from_codeset and to_codeset, so that the caller can emit
1233
         an error message such as
1234
           "iconv() is not supported. Installing GNU libiconv and
1235
            then reinstalling this package would fix this."  */
1236
      errno = ENOSYS;
1237
      return NULL;
1238
#endif
1239
0
    }
1240
0
}