Coverage Report

Created: 2023-03-26 07:33

/src/libunistring/lib/striconveh.c
Line
Count
Source (jump to first uncovered line)
1
/* Character set conversion with error handling.
2
   Copyright (C) 2001-2022 Free Software Foundation, Inc.
3
   Written by Bruno Haible and Simon Josefsson.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
#include <config.h>
19
20
/* Specification.  */
21
#include "striconveh.h"
22
23
#include <errno.h>
24
#include <stdbool.h>
25
#include <stdlib.h>
26
#include <string.h>
27
28
#if HAVE_ICONV
29
# include <iconv.h>
30
# include "unistr.h"
31
#endif
32
33
#include "c-strcase.h"
34
#include "c-strcaseeq.h"
35
36
#ifndef SIZE_MAX
37
# define SIZE_MAX ((size_t) -1)
38
#endif
39
40
41
#if HAVE_ICONV
42
43
/* The caller must provide an iconveh_t, not just an iconv_t, because when a
44
   conversion error occurs, we may have to determine the Unicode representation
45
   of the inconvertible character.  */
46
47
int
48
iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
49
0
{
50
0
  iconv_t cd;
51
0
  iconv_t cd1;
52
0
  iconv_t cd2;
53
54
  /* Avoid glibc-2.1 bug with EUC-KR.  */
55
# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
56
     && !defined _LIBICONV_VERSION
57
  if (c_strcasecmp (from_codeset, "EUC-KR") == 0
58
      || c_strcasecmp (to_codeset, "EUC-KR") == 0)
59
    {
60
      errno = EINVAL;
61
      return -1;
62
    }
63
# endif
64
65
0
  cd = iconv_open (to_codeset, from_codeset);
66
67
0
  if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
68
0
    cd1 = (iconv_t)(-1);
69
0
  else
70
0
    {
71
0
      cd1 = iconv_open ("UTF-8", from_codeset);
72
0
      if (cd1 == (iconv_t)(-1))
73
0
        {
74
0
          int saved_errno = errno;
75
0
          if (cd != (iconv_t)(-1))
76
0
            iconv_close (cd);
77
0
          errno = saved_errno;
78
0
          return -1;
79
0
        }
80
0
    }
81
82
0
  if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
83
0
# if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
84
0
      && !defined __UCLIBC__) \
85
0
     || _LIBICONV_VERSION >= 0x0105
86
0
      || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
87
0
# endif
88
0
     )
89
0
    cd2 = (iconv_t)(-1);
90
0
  else
91
0
    {
92
0
      cd2 = iconv_open (to_codeset, "UTF-8");
93
0
      if (cd2 == (iconv_t)(-1))
94
0
        {
95
0
          int saved_errno = errno;
96
0
          if (cd1 != (iconv_t)(-1))
97
0
            iconv_close (cd1);
98
0
          if (cd != (iconv_t)(-1))
99
0
            iconv_close (cd);
100
0
          errno = saved_errno;
101
0
          return -1;
102
0
        }
103
0
    }
104
105
0
  cdp->cd = cd;
106
0
  cdp->cd1 = cd1;
107
0
  cdp->cd2 = cd2;
108
0
  return 0;
109
0
}
110
111
int
112
iconveh_close (const iconveh_t *cd)
113
0
{
114
0
  if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
115
0
    {
116
      /* Return -1, but preserve the errno from iconv_close.  */
117
0
      int saved_errno = errno;
118
0
      if (cd->cd1 != (iconv_t)(-1))
119
0
        iconv_close (cd->cd1);
120
0
      if (cd->cd != (iconv_t)(-1))
121
0
        iconv_close (cd->cd);
122
0
      errno = saved_errno;
123
0
      return -1;
124
0
    }
125
0
  if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
126
0
    {
127
      /* Return -1, but preserve the errno from iconv_close.  */
128
0
      int saved_errno = errno;
129
0
      if (cd->cd != (iconv_t)(-1))
130
0
        iconv_close (cd->cd);
131
0
      errno = saved_errno;
132
0
      return -1;
133
0
    }
134
0
  if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
135
0
    return -1;
136
0
  return 0;
137
0
}
138
139
/* iconv_carefully is like iconv, except that it stops as soon as it encounters
   a conversion error, and it returns in *INCREMENTED a boolean telling whether
   it has incremented the input pointers past the error location.  */
# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
/* Irix iconv() inserts a NUL byte if it cannot convert.
   NetBSD iconv() inserts a question mark if it cannot convert.
   Only GNU libiconv and GNU libc are known to prefer to fail rather
   than doing a lossy conversion.
   Therefore feed the input to iconv() in the smallest pieces it accepts, and
   turn a positive return value (a lossy conversion) into an EILSEQ
   failure.  */
static size_t
iconv_carefully (iconv_t cd,
                 const char **inbuf, size_t *inbytesleft,
                 char **outbuf, size_t *outbytesleft,
                 bool *incremented)
{
  const char *cursor = *inbuf;
  const char *const limit = cursor + *inbytesleft;
  char *dst = *outbuf;
  size_t dstleft = *outbytesleft;
  const char *mark;
  size_t res;

  for (;;)
    {
      size_t chunk = 1;

      mark = cursor;
      res = (size_t)(-1);

      /* Offer 1, 2, 3, ... bytes until iconv() stops reporting an
         incomplete input sequence.  */
      while (cursor + chunk <= limit)
        {
          res = iconv (cd,
                       (ICONV_CONST char **) &cursor, &chunk,
                       &dst, &dstleft);
          if (res != (size_t)(-1) || errno != EINVAL)
            break;
          /* iconv can eat up a shift sequence but give EINVAL while attempting
             to convert the first character.  E.g. libiconv does this.  */
          if (cursor > mark)
            {
              res = 0;
              break;
            }
          chunk++;
        }

      if (res != 0)
        break;

      /* Only a clean step is committed to the caller's output pointers.  */
      *outbuf = dst;
      *outbytesleft = dstleft;

      if (cursor >= limit)
        break;
    }

  *inbuf = cursor;
  *inbytesleft = limit - cursor;

  if (res == 0 || res == (size_t)(-1))
    {
      *incremented = false;
      return res;
    }

  /* iconv() has already incremented CURSOR.  We cannot go back to a
     previous position, otherwise the state inside CD would become invalid,
     if FROM_CODESET is a stateful encoding.  So, tell the caller that
     *INBUF has already been incremented.  */
  *incremented = (cursor > mark);
  errno = EILSEQ;
  return (size_t)(-1);
}
# else
/* With GNU libiconv and GNU libc, iconv() itself can be used; the input
   pointers are reported as not incremented.  */
#  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
     (*(incremented) = false, \
      iconv ((cd), (ICONV_CONST char **) (inbuf), (inbytesleft), (outbuf), (outbytesleft)))
# endif
214
215
/* iconv_carefully_1 is like iconv_carefully, except that it stops after
216
   converting one character or one shift sequence.  */
217
static size_t
218
iconv_carefully_1 (iconv_t cd,
219
                   const char **inbuf, size_t *inbytesleft,
220
                   char **outbuf, size_t *outbytesleft,
221
                   bool *incremented)
222
0
{
223
0
  const char *inptr_before = *inbuf;
224
0
  const char *inptr = inptr_before;
225
0
  const char *inptr_end = inptr_before + *inbytesleft;
226
0
  char *outptr = *outbuf;
227
0
  size_t outsize = *outbytesleft;
228
0
  size_t res = (size_t)(-1);
229
0
  size_t insize;
230
231
0
  for (insize = 1; inptr_before + insize <= inptr_end; insize++)
232
0
    {
233
0
      inptr = inptr_before;
234
0
      res = iconv (cd,
235
0
                   (ICONV_CONST char **) &inptr, &insize,
236
0
                   &outptr, &outsize);
237
0
      if (!(res == (size_t)(-1) && errno == EINVAL))
238
0
        break;
239
      /* iconv can eat up a shift sequence but give EINVAL while attempting
240
         to convert the first character.  E.g. libiconv does this.  */
241
0
      if (inptr > inptr_before)
242
0
        {
243
0
          res = 0;
244
0
          break;
245
0
        }
246
0
    }
247
248
0
  *inbuf = inptr;
249
0
  *inbytesleft = inptr_end - inptr;
250
# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
251
  /* Irix iconv() inserts a NUL byte if it cannot convert.
252
     NetBSD iconv() inserts a question mark if it cannot convert.
253
     Only GNU libiconv and GNU libc are known to prefer to fail rather
254
     than doing a lossy conversion.  */
255
  if (res != (size_t)(-1) && res > 0)
256
    {
257
      /* iconv() has already incremented INPTR.  We cannot go back to a
258
         previous INPTR, otherwise the state inside CD would become invalid,
259
         if FROM_CODESET is a stateful encoding.  So, tell the caller that
260
         *INBUF has already been incremented.  */
261
      *incremented = (inptr > inptr_before);
262
      errno = EILSEQ;
263
      return (size_t)(-1);
264
    }
265
# endif
266
267
0
  if (res != (size_t)(-1))
268
0
    {
269
0
      *outbuf = outptr;
270
0
      *outbytesleft = outsize;
271
0
    }
272
0
  *incremented = false;
273
0
  return res;
274
0
}
275
276
/* utf8conv_carefully is like iconv, except that
277
     - it converts from UTF-8 to UTF-8,
278
     - it stops as soon as it encounters a conversion error, and it returns
279
       in *INCREMENTED a boolean telling whether it has incremented the input
280
       pointers past the error location,
281
     - if one_character_only is true, it stops after converting one
282
       character.  */
283
static size_t
284
utf8conv_carefully (bool one_character_only,
285
                    const char **inbuf, size_t *inbytesleft,
286
                    char **outbuf, size_t *outbytesleft,
287
                    bool *incremented)
288
0
{
289
0
  const char *inptr = *inbuf;
290
0
  size_t insize = *inbytesleft;
291
0
  char *outptr = *outbuf;
292
0
  size_t outsize = *outbytesleft;
293
0
  size_t res;
294
295
0
  res = 0;
296
0
  do
297
0
    {
298
0
      ucs4_t uc;
299
0
      int n;
300
0
      int m;
301
302
0
      n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
303
0
      if (n < 0)
304
0
        {
305
0
          errno = (n == -2 ? EINVAL : EILSEQ);
306
0
          n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
307
0
          inptr += n;
308
0
          insize -= n;
309
0
          res = (size_t)(-1);
310
0
          *incremented = true;
311
0
          break;
312
0
        }
313
0
      if (outsize == 0)
314
0
        {
315
0
          errno = E2BIG;
316
0
          res = (size_t)(-1);
317
0
          *incremented = false;
318
0
          break;
319
0
        }
320
0
      m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
321
0
      if (m == -2)
322
0
        {
323
0
          errno = E2BIG;
324
0
          res = (size_t)(-1);
325
0
          *incremented = false;
326
0
          break;
327
0
        }
328
0
      inptr += n;
329
0
      insize -= n;
330
0
      if (m == -1)
331
0
        {
332
0
          errno = EILSEQ;
333
0
          res = (size_t)(-1);
334
0
          *incremented = true;
335
0
          break;
336
0
        }
337
0
      outptr += m;
338
0
      outsize -= m;
339
0
    }
340
0
  while (!one_character_only && insize > 0);
341
342
0
  *inbuf = inptr;
343
0
  *inbytesleft = insize;
344
0
  *outbuf = outptr;
345
0
  *outbytesleft = outsize;
346
0
  return res;
347
0
}
348
349
static int
350
mem_cd_iconveh_internal (const char *src, size_t srclen,
351
                         iconv_t cd, iconv_t cd1, iconv_t cd2,
352
                         enum iconv_ilseq_handler handler,
353
                         size_t extra_alloc,
354
                         size_t *offsets,
355
                         char **resultp, size_t *lengthp)
356
0
{
357
  /* When a conversion error occurs, we cannot start using CD1 and CD2 at
358
     this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
359
     Instead, we have to start afresh from the beginning of SRC.  */
360
  /* Use a temporary buffer, so that for small strings, a single malloc()
361
     call will be sufficient.  */
362
0
# define tmpbufsize 4096
363
  /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
364
     libiconv's UCS-4-INTERNAL encoding.  */
365
0
  union { unsigned int align; char buf[tmpbufsize]; } tmp;
366
0
# define tmpbuf tmp.buf
367
368
0
  char *initial_result;
369
0
  char *result;
370
0
  size_t allocated;
371
0
  size_t length;
372
0
  size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
373
374
0
  if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
375
0
    {
376
0
      initial_result = *resultp;
377
0
      allocated = *lengthp;
378
0
    }
379
0
  else
380
0
    {
381
0
      initial_result = tmpbuf;
382
0
      allocated = sizeof (tmpbuf);
383
0
    }
384
0
  result = initial_result;
385
386
  /* Test whether a direct conversion is possible at all.  */
387
0
  if (cd == (iconv_t)(-1))
388
0
    goto indirectly;
389
390
0
  if (offsets != NULL)
391
0
    {
392
0
      size_t i;
393
394
0
      for (i = 0; i < srclen; i++)
395
0
        offsets[i] = (size_t)(-1);
396
397
0
      last_length = (size_t)(-1);
398
0
    }
399
0
  length = 0;
400
401
  /* First, try a direct conversion, and see whether a conversion error
402
     occurs at all.  */
403
0
  {
404
0
    const char *inptr = src;
405
0
    size_t insize = srclen;
406
407
    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
408
0
# if defined _LIBICONV_VERSION \
409
0
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
410
0
          || defined __sun)
411
    /* Set to the initial state.  */
412
0
    iconv (cd, NULL, NULL, NULL, NULL);
413
0
# endif
414
415
0
    while (insize > 0)
416
0
      {
417
0
        char *outptr = result + length;
418
0
        size_t outsize = allocated - extra_alloc - length;
419
0
        bool incremented;
420
0
        size_t res;
421
0
        bool grow;
422
423
0
        if (offsets != NULL)
424
0
          {
425
0
            if (length != last_length) /* ensure that offset[] be increasing */
426
0
              {
427
0
                offsets[inptr - src] = length;
428
0
                last_length = length;
429
0
              }
430
0
            res = iconv_carefully_1 (cd,
431
0
                                     &inptr, &insize,
432
0
                                     &outptr, &outsize,
433
0
                                     &incremented);
434
0
          }
435
0
        else
436
          /* Use iconv_carefully instead of iconv here, because:
437
             - If TO_CODESET is UTF-8, we can do the error handling in this
438
               loop, no need for a second loop,
439
             - With iconv() implementations other than GNU libiconv and GNU
440
               libc, if we use iconv() in a big swoop, checking for an E2BIG
441
               return, we lose the number of irreversible conversions.  */
442
0
          res = iconv_carefully (cd,
443
0
                                 &inptr, &insize,
444
0
                                 &outptr, &outsize,
445
0
                                 &incremented);
446
447
0
        length = outptr - result;
448
0
        grow = (length + extra_alloc > allocated / 2);
449
0
        if (res == (size_t)(-1))
450
0
          {
451
0
            if (errno == E2BIG)
452
0
              grow = true;
453
0
            else if (errno == EINVAL)
454
0
              break;
455
0
            else if (errno == EILSEQ && handler != iconveh_error)
456
0
              {
457
0
                if (cd2 == (iconv_t)(-1))
458
0
                  {
459
                    /* TO_CODESET is UTF-8.  */
460
                    /* Error handling can produce up to 1 or 3 bytes of
461
                       output.  */
462
0
                    size_t extra_need =
463
0
                      (handler == iconveh_replacement_character ? 3 : 1);
464
0
                    if (length + extra_need + extra_alloc > allocated)
465
0
                      {
466
0
                        char *memory;
467
468
0
                        allocated = 2 * allocated;
469
0
                        if (length + extra_need + extra_alloc > allocated)
470
0
                          allocated = 2 * allocated;
471
0
                        if (length + extra_need + extra_alloc > allocated)
472
0
                          abort ();
473
0
                        if (result == initial_result)
474
0
                          memory = (char *) malloc (allocated);
475
0
                        else
476
0
                          memory = (char *) realloc (result, allocated);
477
0
                        if (memory == NULL)
478
0
                          {
479
0
                            if (result != initial_result)
480
0
                              free (result);
481
0
                            errno = ENOMEM;
482
0
                            return -1;
483
0
                          }
484
0
                        if (result == initial_result)
485
0
                          memcpy (memory, initial_result, length);
486
0
                        result = memory;
487
0
                        grow = false;
488
0
                      }
489
                    /* The input is invalid in FROM_CODESET.  Eat up one byte
490
                       and emit a replacement character or a question mark.  */
491
0
                    if (!incremented)
492
0
                      {
493
0
                        if (insize == 0)
494
0
                          abort ();
495
0
                        inptr++;
496
0
                        insize--;
497
0
                      }
498
0
                    if (handler == iconveh_replacement_character)
499
0
                      {
500
                        /* U+FFFD in UTF-8 encoding.  */
501
0
                        result[length+0] = '\357';
502
0
                        result[length+1] = '\277';
503
0
                        result[length+2] = '\275';
504
0
                        length += 3;
505
0
                      }
506
0
                    else
507
0
                      {
508
0
                        result[length] = '?';
509
0
                        length++;
510
0
                      }
511
0
                  }
512
0
                else
513
0
                  goto indirectly;
514
0
              }
515
0
            else
516
0
              {
517
0
                if (result != initial_result)
518
0
                  free (result);
519
0
                return -1;
520
0
              }
521
0
          }
522
0
        if (insize == 0)
523
0
          break;
524
0
        if (grow)
525
0
          {
526
0
            char *memory;
527
528
0
            allocated = 2 * allocated;
529
0
            if (result == initial_result)
530
0
              memory = (char *) malloc (allocated);
531
0
            else
532
0
              memory = (char *) realloc (result, allocated);
533
0
            if (memory == NULL)
534
0
              {
535
0
                if (result != initial_result)
536
0
                  free (result);
537
0
                errno = ENOMEM;
538
0
                return -1;
539
0
              }
540
0
            if (result == initial_result)
541
0
              memcpy (memory, initial_result, length);
542
0
            result = memory;
543
0
          }
544
0
      }
545
0
  }
546
547
  /* Now get the conversion state back to the initial state.
548
     But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
549
0
#if defined _LIBICONV_VERSION \
550
0
    || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
551
0
         || defined __sun)
552
0
  for (;;)
553
0
    {
554
0
      char *outptr = result + length;
555
0
      size_t outsize = allocated - extra_alloc - length;
556
0
      size_t res;
557
558
0
      res = iconv (cd, NULL, NULL, &outptr, &outsize);
559
0
      length = outptr - result;
560
0
      if (res == (size_t)(-1))
561
0
        {
562
0
          if (errno == E2BIG)
563
0
            {
564
0
              char *memory;
565
566
0
              allocated = 2 * allocated;
567
0
              if (result == initial_result)
568
0
                memory = (char *) malloc (allocated);
569
0
              else
570
0
                memory = (char *) realloc (result, allocated);
571
0
              if (memory == NULL)
572
0
                {
573
0
                  if (result != initial_result)
574
0
                    free (result);
575
0
                  errno = ENOMEM;
576
0
                  return -1;
577
0
                }
578
0
              if (result == initial_result)
579
0
                memcpy (memory, initial_result, length);
580
0
              result = memory;
581
0
            }
582
0
          else
583
0
            {
584
0
              if (result != initial_result)
585
0
                free (result);
586
0
              return -1;
587
0
            }
588
0
        }
589
0
      else
590
0
        break;
591
0
    }
592
0
#endif
593
594
  /* The direct conversion succeeded.  */
595
0
  goto done;
596
597
0
 indirectly:
598
  /* The direct conversion failed.
599
     Use a conversion through UTF-8.  */
600
0
  if (offsets != NULL)
601
0
    {
602
0
      size_t i;
603
604
0
      for (i = 0; i < srclen; i++)
605
0
        offsets[i] = (size_t)(-1);
606
607
0
      last_length = (size_t)(-1);
608
0
    }
609
0
  length = 0;
610
0
  {
611
0
    const bool slowly = (offsets != NULL || handler == iconveh_error);
612
0
# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
613
0
    char utf8buf[utf8bufsize + 3];
614
0
    size_t utf8len = 0;
615
0
    const char *in1ptr = src;
616
0
    size_t in1size = srclen;
617
0
    bool do_final_flush1 = true;
618
0
    bool do_final_flush2 = true;
619
620
    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
621
0
# if defined _LIBICONV_VERSION \
622
0
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
623
0
          || defined __sun)
624
    /* Set to the initial state.  */
625
0
    if (cd1 != (iconv_t)(-1))
626
0
      iconv (cd1, NULL, NULL, NULL, NULL);
627
0
    if (cd2 != (iconv_t)(-1))
628
0
      iconv (cd2, NULL, NULL, NULL, NULL);
629
0
# endif
630
631
0
    while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
632
0
      {
633
0
        char *out1ptr = utf8buf + utf8len;
634
0
        size_t out1size = utf8bufsize - utf8len;
635
0
        bool incremented1;
636
0
        size_t res1;
637
0
        int errno1;
638
639
        /* Conversion step 1: from FROM_CODESET to UTF-8.  */
640
0
        if (in1size > 0)
641
0
          {
642
0
            if (offsets != NULL
643
0
                && length != last_length) /* ensure that offset[] be increasing */
644
0
              {
645
0
                offsets[in1ptr - src] = length;
646
0
                last_length = length;
647
0
              }
648
0
            if (cd1 != (iconv_t)(-1))
649
0
              {
650
0
                if (slowly)
651
0
                  res1 = iconv_carefully_1 (cd1,
652
0
                                            &in1ptr, &in1size,
653
0
                                            &out1ptr, &out1size,
654
0
                                            &incremented1);
655
0
                else
656
0
                  res1 = iconv_carefully (cd1,
657
0
                                          &in1ptr, &in1size,
658
0
                                          &out1ptr, &out1size,
659
0
                                          &incremented1);
660
0
              }
661
0
            else
662
0
              {
663
                /* FROM_CODESET is UTF-8.  */
664
0
                res1 = utf8conv_carefully (slowly,
665
0
                                           &in1ptr, &in1size,
666
0
                                           &out1ptr, &out1size,
667
0
                                           &incremented1);
668
0
              }
669
0
          }
670
0
        else if (do_final_flush1)
671
0
          {
672
            /* Now get the conversion state of CD1 back to the initial state.
673
               But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
674
0
# if defined _LIBICONV_VERSION \
675
0
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
676
0
          || defined __sun)
677
0
            if (cd1 != (iconv_t)(-1))
678
0
              res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
679
0
            else
680
0
# endif
681
0
              res1 = 0;
682
0
            do_final_flush1 = false;
683
0
            incremented1 = true;
684
0
          }
685
0
        else
686
0
          {
687
0
            res1 = 0;
688
0
            incremented1 = true;
689
0
          }
690
0
        if (res1 == (size_t)(-1)
691
0
            && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
692
0
          {
693
0
            if (result != initial_result)
694
0
              free (result);
695
0
            return -1;
696
0
          }
697
0
        if (res1 == (size_t)(-1)
698
0
            && errno == EILSEQ && handler != iconveh_error)
699
0
          {
700
            /* The input is invalid in FROM_CODESET.  Eat up one byte and
701
               emit a U+FFFD character or a question mark.  Room for this
702
               character was allocated at the end of utf8buf.  */
703
0
            if (!incremented1)
704
0
              {
705
0
                if (in1size == 0)
706
0
                  abort ();
707
0
                in1ptr++;
708
0
                in1size--;
709
0
              }
710
0
            if (handler == iconveh_replacement_character)
711
0
              {
712
                /* U+FFFD in UTF-8 encoding.  */
713
0
                out1ptr[0] = '\357';
714
0
                out1ptr[1] = '\277';
715
0
                out1ptr[2] = '\275';
716
0
                out1ptr += 3;
717
0
              }
718
0
            else
719
0
              *out1ptr++ = '?';
720
0
            res1 = 0;
721
0
          }
722
0
        errno1 = errno;
723
0
        utf8len = out1ptr - utf8buf;
724
725
0
        if (offsets != NULL
726
0
            || in1size == 0
727
0
            || utf8len > utf8bufsize / 2
728
0
            || (res1 == (size_t)(-1) && errno1 == E2BIG))
729
0
          {
730
            /* Conversion step 2: from UTF-8 to TO_CODESET.  */
731
0
            const char *in2ptr = utf8buf;
732
0
            size_t in2size = utf8len;
733
734
0
            while (in2size > 0
735
0
                   || (in1size == 0 && !do_final_flush1 && do_final_flush2))
736
0
              {
737
0
                char *out2ptr = result + length;
738
0
                size_t out2size = allocated - extra_alloc - length;
739
0
                bool incremented2;
740
0
                size_t res2;
741
0
                bool grow;
742
743
0
                if (in2size > 0)
744
0
                  {
745
0
                    if (cd2 != (iconv_t)(-1))
746
0
                      res2 = iconv_carefully (cd2,
747
0
                                              &in2ptr, &in2size,
748
0
                                              &out2ptr, &out2size,
749
0
                                              &incremented2);
750
0
                    else
751
                      /* TO_CODESET is UTF-8.  */
752
0
                      res2 = utf8conv_carefully (false,
753
0
                                                 &in2ptr, &in2size,
754
0
                                                 &out2ptr, &out2size,
755
0
                                                 &incremented2);
756
0
                  }
757
0
                else /* in1size == 0 && !do_final_flush1
758
                        && in2size == 0 && do_final_flush2 */
759
0
                  {
760
                    /* Now get the conversion state of CD1 back to the initial
761
                       state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
762
0
# if defined _LIBICONV_VERSION \
763
0
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
764
0
          || defined __sun)
765
0
                    if (cd2 != (iconv_t)(-1))
766
0
                      res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
767
0
                    else
768
0
# endif
769
0
                      res2 = 0;
770
0
                    do_final_flush2 = false;
771
0
                    incremented2 = true;
772
0
                  }
773
774
0
                length = out2ptr - result;
775
0
                grow = (length + extra_alloc > allocated / 2);
776
0
                if (res2 == (size_t)(-1))
777
0
                  {
778
0
                    if (errno == E2BIG)
779
0
                      grow = true;
780
0
                    else if (errno == EINVAL)
781
0
                      break;
782
0
                    else if (errno == EILSEQ && handler != iconveh_error)
783
0
                      {
784
                        /* Error handling can produce up to 10 bytes of UTF-8
785
                           output.  But TO_CODESET may be UCS-2, UTF-16 or
786
                           UCS-4, so use CD2 here as well.  */
787
0
                        char scratchbuf[10];
788
0
                        size_t scratchlen;
789
0
                        ucs4_t uc;
790
0
                        const char *inptr;
791
0
                        size_t insize;
792
0
                        size_t res;
793
794
0
                        if (incremented2)
795
0
                          {
796
0
                            if (u8_prev (&uc, (const uint8_t *) in2ptr,
797
0
                                         (const uint8_t *) utf8buf)
798
0
                                == NULL)
799
0
                              abort ();
800
0
                          }
801
0
                        else
802
0
                          {
803
0
                            int n;
804
0
                            if (in2size == 0)
805
0
                              abort ();
806
0
                            n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
807
0
                                                  in2size);
808
0
                            in2ptr += n;
809
0
                            in2size -= n;
810
0
                          }
811
812
0
                        if (handler == iconveh_escape_sequence)
813
0
                          {
814
0
                            static char hex[16] = "0123456789ABCDEF";
815
0
                            scratchlen = 0;
816
0
                            scratchbuf[scratchlen++] = '\\';
817
0
                            if (uc < 0x10000)
818
0
                              scratchbuf[scratchlen++] = 'u';
819
0
                            else
820
0
                              {
821
0
                                scratchbuf[scratchlen++] = 'U';
822
0
                                scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
823
0
                                scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
824
0
                                scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
825
0
                                scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
826
0
                              }
827
0
                            scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
828
0
                            scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
829
0
                            scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
830
0
                            scratchbuf[scratchlen++] = hex[uc & 15];
831
0
                          }
832
0
                        else if (handler == iconveh_replacement_character)
833
0
                          {
834
                            /* U+FFFD in UTF-8 encoding.  */
835
0
                            scratchbuf[0] = '\357';
836
0
                            scratchbuf[1] = '\277';
837
0
                            scratchbuf[2] = '\275';
838
0
                            scratchlen = 3;
839
0
                          }
840
0
                        else
841
0
                          {
842
0
                            scratchbuf[0] = '?';
843
0
                            scratchlen = 1;
844
0
                          }
845
846
0
                        inptr = scratchbuf;
847
0
                        insize = scratchlen;
848
0
                        if (cd2 != (iconv_t)(-1))
849
0
                          {
850
0
                            char *out2ptr_try = out2ptr;
851
0
                            size_t out2size_try = out2size;
852
0
                            res = iconv (cd2,
853
0
                                         (ICONV_CONST char **) &inptr, &insize,
854
0
                                         &out2ptr_try, &out2size_try);
855
0
                            if (handler == iconveh_replacement_character
856
0
                                && (res == (size_t)(-1)
857
0
                                    ? errno == EILSEQ
858
                                    /* FreeBSD iconv(), NetBSD iconv(), and
859
                                       Solaris 11 iconv() insert a '?' if they
860
                                       cannot convert.  This is what we want.
861
                                       But IRIX iconv() inserts a NUL byte if it
862
                                       cannot convert.
863
                                       And musl libc iconv() inserts a '*' if it
864
                                       cannot convert.  */
865
0
                                    : (res > 0
866
0
                                       && !(out2ptr_try - out2ptr == 1
867
0
                                            && *out2ptr == '?'))))
868
0
                              {
869
                                /* The iconv() call failed.
870
                                   U+FFFD can't be converted to TO_CODESET.
871
                                   Use '?' instead.  */
872
0
                                scratchbuf[0] = '?';
873
0
                                scratchlen = 1;
874
0
                                inptr = scratchbuf;
875
0
                                insize = scratchlen;
876
0
                                res = iconv (cd2,
877
0
                                             (ICONV_CONST char **) &inptr, &insize,
878
0
                                             &out2ptr, &out2size);
879
0
                              }
880
0
                            else
881
0
                              {
882
                                /* Accept the results of the iconv() call.  */
883
0
                                out2ptr = out2ptr_try;
884
0
                                out2size = out2size_try;
885
0
                                res = 0;
886
0
                              }
887
0
                          }
888
0
                        else
889
0
                          {
890
                            /* TO_CODESET is UTF-8.  */
891
0
                            if (out2size >= insize)
892
0
                              {
893
0
                                memcpy (out2ptr, inptr, insize);
894
0
                                out2ptr += insize;
895
0
                                out2size -= insize;
896
0
                                inptr += insize;
897
0
                                insize = 0;
898
0
                                res = 0;
899
0
                              }
900
0
                            else
901
0
                              {
902
0
                                errno = E2BIG;
903
0
                                res = (size_t)(-1);
904
0
                              }
905
0
                          }
906
0
                        length = out2ptr - result;
907
0
                        if (res == (size_t)(-1) && errno == E2BIG)
908
0
                          {
909
0
                            char *memory;
910
911
0
                            allocated = 2 * allocated;
912
0
                            if (length + 1 + extra_alloc > allocated)
913
0
                              abort ();
914
0
                            if (result == initial_result)
915
0
                              memory = (char *) malloc (allocated);
916
0
                            else
917
0
                              memory = (char *) realloc (result, allocated);
918
0
                            if (memory == NULL)
919
0
                              {
920
0
                                if (result != initial_result)
921
0
                                  free (result);
922
0
                                errno = ENOMEM;
923
0
                                return -1;
924
0
                              }
925
0
                            if (result == initial_result)
926
0
                              memcpy (memory, initial_result, length);
927
0
                            result = memory;
928
0
                            grow = false;
929
930
0
                            out2ptr = result + length;
931
0
                            out2size = allocated - extra_alloc - length;
932
0
                            if (cd2 != (iconv_t)(-1))
933
0
                              res = iconv (cd2,
934
0
                                           (ICONV_CONST char **) &inptr,
935
0
                                           &insize,
936
0
                                           &out2ptr, &out2size);
937
0
                            else
938
0
                              {
939
                                /* TO_CODESET is UTF-8.  */
940
0
                                if (!(out2size >= insize))
941
0
                                  abort ();
942
0
                                memcpy (out2ptr, inptr, insize);
943
0
                                out2ptr += insize;
944
0
                                out2size -= insize;
945
0
                                inptr += insize;
946
0
                                insize = 0;
947
0
                                res = 0;
948
0
                              }
949
0
                            length = out2ptr - result;
950
0
                          }
951
# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
952
                        /* IRIX iconv() inserts a NUL byte if it cannot convert.
953
                           FreeBSD iconv(), NetBSD iconv(), and Solaris 11
954
                           iconv() insert a '?' if they cannot convert.
955
                           musl libc iconv() inserts a '*' if it cannot convert.
956
                           Only GNU libiconv and GNU libc are known to prefer
957
                           to fail rather than doing a lossy conversion.  */
958
                        if (res != (size_t)(-1) && res > 0)
959
                          {
960
                            errno = EILSEQ;
961
                            res = (size_t)(-1);
962
                          }
963
# endif
964
0
                        if (res == (size_t)(-1))
965
0
                          {
966
                            /* Failure converting the ASCII replacement.  */
967
0
                            if (result != initial_result)
968
0
                              free (result);
969
0
                            return -1;
970
0
                          }
971
0
                      }
972
0
                    else
973
0
                      {
974
0
                        if (result != initial_result)
975
0
                          free (result);
976
0
                        return -1;
977
0
                      }
978
0
                  }
979
0
                if (!(in2size > 0
980
0
                      || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
981
0
                  break;
982
0
                if (grow)
983
0
                  {
984
0
                    char *memory;
985
986
0
                    allocated = 2 * allocated;
987
0
                    if (result == initial_result)
988
0
                      memory = (char *) malloc (allocated);
989
0
                    else
990
0
                      memory = (char *) realloc (result, allocated);
991
0
                    if (memory == NULL)
992
0
                      {
993
0
                        if (result != initial_result)
994
0
                          free (result);
995
0
                        errno = ENOMEM;
996
0
                        return -1;
997
0
                      }
998
0
                    if (result == initial_result)
999
0
                      memcpy (memory, initial_result, length);
1000
0
                    result = memory;
1001
0
                  }
1002
0
              }
1003
1004
            /* Move the remaining bytes to the beginning of utf8buf.  */
1005
0
            if (in2size > 0)
1006
0
              memmove (utf8buf, in2ptr, in2size);
1007
0
            utf8len = in2size;
1008
0
          }
1009
1010
0
        if (res1 == (size_t)(-1))
1011
0
          {
1012
0
            if (errno1 == EINVAL)
1013
0
              in1size = 0;
1014
0
            else if (errno1 == EILSEQ)
1015
0
              {
1016
0
                if (result != initial_result)
1017
0
                  free (result);
1018
0
                errno = errno1;
1019
0
                return -1;
1020
0
              }
1021
0
          }
1022
0
      }
1023
0
# undef utf8bufsize
1024
0
  }
1025
1026
0
 done:
1027
  /* Now the final memory allocation.  */
1028
0
  if (result == tmpbuf)
1029
0
    {
1030
0
      size_t memsize = length + extra_alloc;
1031
1032
0
      if (*resultp != NULL && *lengthp >= memsize)
1033
0
        result = *resultp;
1034
0
      else
1035
0
        {
1036
0
          char *memory;
1037
1038
0
          memory = (char *) malloc (memsize > 0 ? memsize : 1);
1039
0
          if (memory != NULL)
1040
0
            result = memory;
1041
0
          else
1042
0
            {
1043
0
              errno = ENOMEM;
1044
0
              return -1;
1045
0
            }
1046
0
        }
1047
0
      memcpy (result, tmpbuf, length);
1048
0
    }
1049
0
  else if (result != *resultp && length + extra_alloc < allocated)
1050
0
    {
1051
      /* Shrink the allocated memory if possible.  */
1052
0
      size_t memsize = length + extra_alloc;
1053
0
      char *memory;
1054
1055
0
      memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
1056
0
      if (memory != NULL)
1057
0
        result = memory;
1058
0
    }
1059
0
  *resultp = result;
1060
0
  *lengthp = length;
1061
0
  return 0;
1062
0
# undef tmpbuf
1063
0
# undef tmpbufsize
1064
0
}
1065
1066
int
1067
mem_cd_iconveh (const char *src, size_t srclen,
1068
                const iconveh_t *cd,
1069
                enum iconv_ilseq_handler handler,
1070
                size_t *offsets,
1071
                char **resultp, size_t *lengthp)
1072
0
{
1073
0
  return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1074
0
                                  handler, 0, offsets, resultp, lengthp);
1075
0
}
1076
1077
char *
1078
str_cd_iconveh (const char *src,
1079
                const iconveh_t *cd,
1080
                enum iconv_ilseq_handler handler)
1081
0
{
1082
  /* For most encodings, a trailing NUL byte in the input will be converted
1083
     to a trailing NUL byte in the output.  But not for UTF-7.  So that this
1084
     function is usable for UTF-7, we have to exclude the NUL byte from the
1085
     conversion and add it by hand afterwards.  */
1086
0
  char *result = NULL;
1087
0
  size_t length = 0;
1088
0
  int retval = mem_cd_iconveh_internal (src, strlen (src),
1089
0
                                        cd->cd, cd->cd1, cd->cd2, handler, 1,
1090
0
                                        NULL, &result, &length);
1091
1092
0
  if (retval < 0)
1093
0
    {
1094
0
      free (result);
1095
0
      return NULL;
1096
0
    }
1097
1098
  /* Add the terminating NUL byte.  */
1099
0
  result[length] = '\0';
1100
1101
0
  return result;
1102
0
}
1103
1104
#endif
1105
1106
int
1107
mem_iconveh (const char *src, size_t srclen,
1108
             const char *from_codeset, const char *to_codeset,
1109
             enum iconv_ilseq_handler handler,
1110
             size_t *offsets,
1111
             char **resultp, size_t *lengthp)
1112
0
{
1113
0
  if (srclen == 0)
1114
0
    {
1115
      /* Nothing to convert.  */
1116
0
      *lengthp = 0;
1117
0
      return 0;
1118
0
    }
1119
0
  else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1120
0
    {
1121
0
      char *result;
1122
1123
0
      if (*resultp != NULL && *lengthp >= srclen)
1124
0
        result = *resultp;
1125
0
      else
1126
0
        {
1127
0
          result = (char *) malloc (srclen);
1128
0
          if (result == NULL)
1129
0
            {
1130
0
              errno = ENOMEM;
1131
0
              return -1;
1132
0
            }
1133
0
        }
1134
0
      memcpy (result, src, srclen);
1135
0
      *resultp = result;
1136
0
      *lengthp = srclen;
1137
0
      return 0;
1138
0
    }
1139
0
  else
1140
0
    {
1141
0
#if HAVE_ICONV
1142
0
      iconveh_t cd;
1143
0
      char *result;
1144
0
      size_t length;
1145
0
      int retval;
1146
1147
0
      if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1148
0
        return -1;
1149
1150
0
      result = *resultp;
1151
0
      length = *lengthp;
1152
0
      retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1153
0
                               &result, &length);
1154
1155
0
      if (retval < 0)
1156
0
        {
1157
          /* Close cd, but preserve the errno from str_cd_iconv.  */
1158
0
          int saved_errno = errno;
1159
0
          iconveh_close (&cd);
1160
0
          errno = saved_errno;
1161
0
        }
1162
0
      else
1163
0
        {
1164
0
          if (iconveh_close (&cd) < 0)
1165
0
            {
1166
0
              if (result != *resultp)
1167
0
                free (result);
1168
0
              return -1;
1169
0
            }
1170
0
          *resultp = result;
1171
0
          *lengthp = length;
1172
0
        }
1173
0
      return retval;
1174
#else
1175
      /* This is a different error code than if iconv_open existed but didn't
1176
         support from_codeset and to_codeset, so that the caller can emit
1177
         an error message such as
1178
           "iconv() is not supported. Installing GNU libiconv and
1179
            then reinstalling this package would fix this."  */
1180
      errno = ENOSYS;
1181
      return -1;
1182
#endif
1183
0
    }
1184
0
}
1185
1186
char *
1187
str_iconveh (const char *src,
1188
             const char *from_codeset, const char *to_codeset,
1189
             enum iconv_ilseq_handler handler)
1190
0
{
1191
0
  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1192
0
    {
1193
0
      char *result = strdup (src);
1194
1195
0
      if (result == NULL)
1196
0
        errno = ENOMEM;
1197
0
      return result;
1198
0
    }
1199
0
  else
1200
0
    {
1201
0
#if HAVE_ICONV
1202
0
      iconveh_t cd;
1203
0
      char *result;
1204
1205
0
      if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1206
0
        return NULL;
1207
1208
0
      result = str_cd_iconveh (src, &cd, handler);
1209
1210
0
      if (result == NULL)
1211
0
        {
1212
          /* Close cd, but preserve the errno from str_cd_iconv.  */
1213
0
          int saved_errno = errno;
1214
0
          iconveh_close (&cd);
1215
0
          errno = saved_errno;
1216
0
        }
1217
0
      else
1218
0
        {
1219
0
          if (iconveh_close (&cd) < 0)
1220
0
            {
1221
0
              free (result);
1222
0
              return NULL;
1223
0
            }
1224
0
        }
1225
0
      return result;
1226
#else
1227
      /* This is a different error code than if iconv_open existed but didn't
1228
         support from_codeset and to_codeset, so that the caller can emit
1229
         an error message such as
1230
           "iconv() is not supported. Installing GNU libiconv and
1231
            then reinstalling this package would fix this."  */
1232
      errno = ENOSYS;
1233
      return NULL;
1234
#endif
1235
0
    }
1236
0
}