Coverage Report

Created: 2025-03-18 06:55

/src/libunistring/lib/striconveh.c
Line
Count
Source (jump to first uncovered line)
1
/* Character set conversion with error handling.
2
   Copyright (C) 2001-2024 Free Software Foundation, Inc.
3
   Written by Bruno Haible and Simon Josefsson.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
#include <config.h>
19
20
/* Specification.  */
21
#include "striconveh.h"
22
23
#include <errno.h>
24
#include <stdlib.h>
25
#include <string.h>
26
27
#if HAVE_ICONV
28
# include <iconv.h>
29
# include "unistr.h"
30
#endif
31
32
#include "c-strcase.h"
33
#include "c-strcaseeq.h"
34
35
#ifndef SIZE_MAX
36
# define SIZE_MAX ((size_t) -1)
37
#endif
38
39
40
#if HAVE_ICONV
41
42
/* The caller must provide an iconveh_t, not just an iconv_t, because when a
43
   conversion error occurs, we may have to determine the Unicode representation
44
   of the inconvertible character.  */
45
46
int
47
iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
48
0
{
49
0
  iconv_t cd;
50
0
  iconv_t cd1;
51
0
  iconv_t cd2;
52
53
0
  cd = iconv_open (to_codeset, from_codeset);
54
55
0
  if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
56
0
    cd1 = (iconv_t)(-1);
57
0
  else
58
0
    {
59
0
      cd1 = iconv_open ("UTF-8", from_codeset);
60
0
      if (cd1 == (iconv_t)(-1))
61
0
        {
62
0
          int saved_errno = errno;
63
0
          if (cd != (iconv_t)(-1))
64
0
            iconv_close (cd);
65
0
          errno = saved_errno;
66
0
          return -1;
67
0
        }
68
0
    }
69
70
0
  if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
71
0
# if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
72
0
      && !defined __UCLIBC__) \
73
0
     || _LIBICONV_VERSION >= 0x0105 \
74
0
     || defined ICONV_SET_TRANSLITERATE
75
0
      || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
76
0
# endif
77
0
     )
78
0
    cd2 = (iconv_t)(-1);
79
0
  else
80
0
    {
81
0
      cd2 = iconv_open (to_codeset, "UTF-8");
82
0
      if (cd2 == (iconv_t)(-1))
83
0
        {
84
0
          int saved_errno = errno;
85
0
          if (cd1 != (iconv_t)(-1))
86
0
            iconv_close (cd1);
87
0
          if (cd != (iconv_t)(-1))
88
0
            iconv_close (cd);
89
0
          errno = saved_errno;
90
0
          return -1;
91
0
        }
92
0
    }
93
94
0
  cdp->cd = cd;
95
0
  cdp->cd1 = cd1;
96
0
  cdp->cd2 = cd2;
97
0
  return 0;
98
0
}
99
100
int
101
iconveh_close (const iconveh_t *cd)
102
0
{
103
0
  if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
104
0
    {
105
      /* Return -1, but preserve the errno from iconv_close.  */
106
0
      int saved_errno = errno;
107
0
      if (cd->cd1 != (iconv_t)(-1))
108
0
        iconv_close (cd->cd1);
109
0
      if (cd->cd != (iconv_t)(-1))
110
0
        iconv_close (cd->cd);
111
0
      errno = saved_errno;
112
0
      return -1;
113
0
    }
114
0
  if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
115
0
    {
116
      /* Return -1, but preserve the errno from iconv_close.  */
117
0
      int saved_errno = errno;
118
0
      if (cd->cd != (iconv_t)(-1))
119
0
        iconv_close (cd->cd);
120
0
      errno = saved_errno;
121
0
      return -1;
122
0
    }
123
0
  if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
124
0
    return -1;
125
0
  return 0;
126
0
}
127
128
/* iconv_carefully is like iconv, except that it stops as soon as it encounters
   a conversion error, and it returns in *INCREMENTED a boolean telling whether
   it has incremented the input pointers past the error location.  */
# if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
     && !(defined __GLIBC__ && !defined __UCLIBC__)
/* Irix iconv() inserts a NUL byte if it cannot convert.
   NetBSD iconv() inserts a question mark if it cannot convert.
   Only GNU libiconv (excluding the bastard Apple iconv) and GNU libc are
   known to prefer to fail rather than doing a lossy conversion.
   Therefore, on other implementations, the input is fed to iconv() one
   character at a time, and a positive return value (the count of
   irreversible conversions) is turned into an EILSEQ failure.  */
static size_t
iconv_carefully (iconv_t cd,
                 const char **inbuf, size_t *inbytesleft,
                 char **outbuf, size_t *outbytesleft,
                 bool *incremented)
{
  const char *cursor = *inbuf;
  const char *const limit = cursor + *inbytesleft;
  char *dst = *outbuf;
  size_t dst_left = *outbytesleft;
  const char *char_start;
  size_t rc;

  for (;;)
    {
      size_t chunk = 1;

      char_start = cursor;
      rc = (size_t)(-1);

      /* Offer 1, 2, 3, ... bytes until iconv() stops reporting an
         incomplete input sequence.  */
      while (cursor + chunk <= limit)
        {
          rc = iconv (cd,
                      (ICONV_CONST char **) &cursor, &chunk,
                      &dst, &dst_left);
          if (rc != (size_t)(-1) || errno != EINVAL)
            break;
          /* iconv can eat up a shift sequence but give EINVAL while attempting
             to convert the first character.  E.g. libiconv does this.  */
          if (cursor > char_start)
            {
              rc = 0;
              break;
            }
          chunk++;
        }

      if (rc != 0)
        break;

      /* Only a clean step is made visible to the caller.  */
      *outbuf = dst;
      *outbytesleft = dst_left;

      if (cursor >= limit)
        break;
    }

  *inbuf = cursor;
  *inbytesleft = limit - cursor;

  if (rc == (size_t)(-1) || rc == 0)
    {
      *incremented = false;
      return rc;
    }

  /* iconv() has already incremented the input pointer.  We cannot go back
     to a previous position, otherwise the state inside CD would become
     invalid, if FROM_CODESET is a stateful encoding.  So, tell the caller
     that *INBUF has already been incremented.  */
  *incremented = (cursor > char_start);
  errno = EILSEQ;
  return (size_t)(-1);
}
# else
#  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
     (*(incremented) = false, \
      iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
# endif
204
205
/* iconv_carefully_1 is like iconv_carefully, except that it stops after
206
   converting one character or one shift sequence.  */
207
static size_t
208
iconv_carefully_1 (iconv_t cd,
209
                   const char **inbuf, size_t *inbytesleft,
210
                   char **outbuf, size_t *outbytesleft,
211
                   bool *incremented)
212
0
{
213
0
  const char *inptr_before = *inbuf;
214
0
  const char *inptr = inptr_before;
215
0
  const char *inptr_end = inptr_before + *inbytesleft;
216
0
  char *outptr = *outbuf;
217
0
  size_t outsize = *outbytesleft;
218
0
  size_t res = (size_t)(-1);
219
0
  size_t insize;
220
221
0
  for (insize = 1; inptr_before + insize <= inptr_end; insize++)
222
0
    {
223
0
      inptr = inptr_before;
224
0
      res = iconv (cd,
225
0
                   (ICONV_CONST char **) &inptr, &insize,
226
0
                   &outptr, &outsize);
227
0
      if (!(res == (size_t)(-1) && errno == EINVAL))
228
0
        break;
229
      /* iconv can eat up a shift sequence but give EINVAL while attempting
230
         to convert the first character.  E.g. libiconv does this.  */
231
0
      if (inptr > inptr_before)
232
0
        {
233
0
          res = 0;
234
0
          break;
235
0
        }
236
0
    }
237
238
0
  *inbuf = inptr;
239
0
  *inbytesleft = inptr_end - inptr;
240
# if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
241
     && !(defined __GLIBC__ && !defined __UCLIBC__)
242
  /* Irix iconv() inserts a NUL byte if it cannot convert.
243
     NetBSD iconv() inserts a question mark if it cannot convert.
244
     Only GNU libiconv (excluding the bastard Apple iconv) and GNU libc are
245
     known to prefer to fail rather than doing a lossy conversion.  */
246
  if (res != (size_t)(-1) && res > 0)
247
    {
248
      /* iconv() has already incremented INPTR.  We cannot go back to a
249
         previous INPTR, otherwise the state inside CD would become invalid,
250
         if FROM_CODESET is a stateful encoding.  So, tell the caller that
251
         *INBUF has already been incremented.  */
252
      *incremented = (inptr > inptr_before);
253
      errno = EILSEQ;
254
      return (size_t)(-1);
255
    }
256
# endif
257
258
0
  if (res != (size_t)(-1))
259
0
    {
260
0
      *outbuf = outptr;
261
0
      *outbytesleft = outsize;
262
0
    }
263
0
  *incremented = false;
264
0
  return res;
265
0
}
266
267
/* utf8conv_carefully is like iconv, except that
268
     - it converts from UTF-8 to UTF-8,
269
     - it stops as soon as it encounters a conversion error, and it returns
270
       in *INCREMENTED a boolean telling whether it has incremented the input
271
       pointers past the error location,
272
     - if one_character_only is true, it stops after converting one
273
       character.  */
274
static size_t
275
utf8conv_carefully (bool one_character_only,
276
                    const char **inbuf, size_t *inbytesleft,
277
                    char **outbuf, size_t *outbytesleft,
278
                    bool *incremented)
279
0
{
280
0
  const char *inptr = *inbuf;
281
0
  size_t insize = *inbytesleft;
282
0
  char *outptr = *outbuf;
283
0
  size_t outsize = *outbytesleft;
284
0
  size_t res;
285
286
0
  res = 0;
287
0
  do
288
0
    {
289
0
      ucs4_t uc;
290
0
      int n;
291
0
      int m;
292
293
0
      n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
294
0
      if (n < 0)
295
0
        {
296
0
          errno = (n == -2 ? EINVAL : EILSEQ);
297
0
          n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
298
0
          inptr += n;
299
0
          insize -= n;
300
0
          res = (size_t)(-1);
301
0
          *incremented = true;
302
0
          break;
303
0
        }
304
0
      if (outsize == 0)
305
0
        {
306
0
          errno = E2BIG;
307
0
          res = (size_t)(-1);
308
0
          *incremented = false;
309
0
          break;
310
0
        }
311
0
      m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
312
0
      if (m == -2)
313
0
        {
314
0
          errno = E2BIG;
315
0
          res = (size_t)(-1);
316
0
          *incremented = false;
317
0
          break;
318
0
        }
319
0
      inptr += n;
320
0
      insize -= n;
321
0
      if (m == -1)
322
0
        {
323
0
          errno = EILSEQ;
324
0
          res = (size_t)(-1);
325
0
          *incremented = true;
326
0
          break;
327
0
        }
328
0
      outptr += m;
329
0
      outsize -= m;
330
0
    }
331
0
  while (!one_character_only && insize > 0);
332
333
0
  *inbuf = inptr;
334
0
  *inbytesleft = insize;
335
0
  *outbuf = outptr;
336
0
  *outbytesleft = outsize;
337
0
  return res;
338
0
}
339
340
static int
341
mem_cd_iconveh_internal (const char *src, size_t srclen,
342
                         iconv_t cd, iconv_t cd1, iconv_t cd2,
343
                         enum iconv_ilseq_handler handler,
344
                         size_t extra_alloc,
345
                         size_t *offsets,
346
                         char **resultp, size_t *lengthp)
347
0
{
348
  /* When a conversion error occurs, we cannot start using CD1 and CD2 at
349
     this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
350
     Instead, we have to start afresh from the beginning of SRC.  */
351
  /* Use a temporary buffer, so that for small strings, a single malloc()
352
     call will be sufficient.  */
353
0
# define tmpbufsize 4096
354
  /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
355
     libiconv's UCS-4-INTERNAL encoding.  */
356
0
  union { unsigned int align; char buf[tmpbufsize]; } tmp;
357
0
# define tmpbuf tmp.buf
358
359
0
  char *initial_result;
360
0
  char *result;
361
0
  size_t allocated;
362
0
  size_t length;
363
0
  size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
364
365
0
  if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
366
0
    {
367
0
      initial_result = *resultp;
368
0
      allocated = *lengthp;
369
0
    }
370
0
  else
371
0
    {
372
0
      initial_result = tmpbuf;
373
0
      allocated = sizeof (tmpbuf);
374
0
    }
375
0
  result = initial_result;
376
377
  /* Test whether a direct conversion is possible at all.  */
378
0
  if (cd == (iconv_t)(-1))
379
0
    goto indirectly;
380
381
0
  if (offsets != NULL)
382
0
    {
383
0
      size_t i;
384
385
0
      for (i = 0; i < srclen; i++)
386
0
        offsets[i] = (size_t)(-1);
387
388
0
      last_length = (size_t)(-1);
389
0
    }
390
0
  length = 0;
391
392
  /* First, try a direct conversion, and see whether a conversion error
393
     occurs at all.  */
394
0
  {
395
0
    const char *inptr = src;
396
0
    size_t insize = srclen;
397
398
    /* Set to the initial state.  */
399
0
    iconv (cd, NULL, NULL, NULL, NULL);
400
401
0
    while (insize > 0)
402
0
      {
403
0
        char *outptr = result + length;
404
0
        size_t outsize = allocated - extra_alloc - length;
405
0
        bool incremented;
406
0
        size_t res;
407
0
        bool grow;
408
409
0
        if (offsets != NULL)
410
0
          {
411
0
            if (length != last_length) /* ensure that offset[] be increasing */
412
0
              {
413
0
                offsets[inptr - src] = length;
414
0
                last_length = length;
415
0
              }
416
0
            res = iconv_carefully_1 (cd,
417
0
                                     &inptr, &insize,
418
0
                                     &outptr, &outsize,
419
0
                                     &incremented);
420
0
          }
421
0
        else
422
          /* Use iconv_carefully instead of iconv here, because:
423
             - If TO_CODESET is UTF-8, we can do the error handling in this
424
               loop, no need for a second loop,
425
             - With iconv() implementations other than GNU libiconv and GNU
426
               libc, if we use iconv() in a big swoop, checking for an E2BIG
427
               return, we lose the number of irreversible conversions.  */
428
0
          res = iconv_carefully (cd,
429
0
                                 &inptr, &insize,
430
0
                                 &outptr, &outsize,
431
0
                                 &incremented);
432
433
0
        length = outptr - result;
434
0
        grow = (length + extra_alloc > allocated / 2);
435
0
        if (res == (size_t)(-1))
436
0
          {
437
0
            if (errno == E2BIG)
438
0
              grow = true;
439
0
            else if (errno == EINVAL)
440
0
              break;
441
0
            else if (errno == EILSEQ && handler != iconveh_error)
442
0
              {
443
0
                if (cd2 == (iconv_t)(-1))
444
0
                  {
445
                    /* TO_CODESET is UTF-8.  */
446
                    /* Error handling can produce up to 1 or 3 bytes of
447
                       output.  */
448
0
                    size_t extra_need =
449
0
                      (handler == iconveh_replacement_character ? 3 : 1);
450
0
                    if (length + extra_need + extra_alloc > allocated)
451
0
                      {
452
0
                        char *memory;
453
454
0
                        allocated = 2 * allocated;
455
0
                        if (length + extra_need + extra_alloc > allocated)
456
0
                          allocated = 2 * allocated;
457
0
                        if (length + extra_need + extra_alloc > allocated)
458
0
                          abort ();
459
0
                        if (result == initial_result)
460
0
                          memory = (char *) malloc (allocated);
461
0
                        else
462
0
                          memory = (char *) realloc (result, allocated);
463
0
                        if (memory == NULL)
464
0
                          {
465
0
                            if (result != initial_result)
466
0
                              free (result);
467
0
                            errno = ENOMEM;
468
0
                            return -1;
469
0
                          }
470
0
                        if (result == initial_result)
471
0
                          memcpy (memory, initial_result, length);
472
0
                        result = memory;
473
0
                        grow = false;
474
0
                      }
475
                    /* The input is invalid in FROM_CODESET.  Eat up one byte
476
                       and emit a replacement character or a question mark.  */
477
0
                    if (!incremented)
478
0
                      {
479
0
                        if (insize == 0)
480
0
                          abort ();
481
0
                        inptr++;
482
0
                        insize--;
483
0
                      }
484
0
                    if (handler == iconveh_replacement_character)
485
0
                      {
486
                        /* U+FFFD in UTF-8 encoding.  */
487
0
                        result[length+0] = '\357';
488
0
                        result[length+1] = '\277';
489
0
                        result[length+2] = '\275';
490
0
                        length += 3;
491
0
                      }
492
0
                    else
493
0
                      {
494
0
                        result[length] = '?';
495
0
                        length++;
496
0
                      }
497
0
                  }
498
0
                else
499
0
                  goto indirectly;
500
0
              }
501
0
            else
502
0
              {
503
0
                if (result != initial_result)
504
0
                  free (result);
505
0
                return -1;
506
0
              }
507
0
          }
508
0
        if (insize == 0)
509
0
          break;
510
0
        if (grow)
511
0
          {
512
0
            char *memory;
513
514
0
            allocated = 2 * allocated;
515
0
            if (result == initial_result)
516
0
              memory = (char *) malloc (allocated);
517
0
            else
518
0
              memory = (char *) realloc (result, allocated);
519
0
            if (memory == NULL)
520
0
              {
521
0
                if (result != initial_result)
522
0
                  free (result);
523
0
                errno = ENOMEM;
524
0
                return -1;
525
0
              }
526
0
            if (result == initial_result)
527
0
              memcpy (memory, initial_result, length);
528
0
            result = memory;
529
0
          }
530
0
      }
531
0
  }
532
533
  /* Now get the conversion state back to the initial state.
534
     But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
535
0
#if defined _LIBICONV_VERSION \
536
0
    || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
537
0
         || defined __sun)
538
0
  for (;;)
539
0
    {
540
0
      char *outptr = result + length;
541
0
      size_t outsize = allocated - extra_alloc - length;
542
0
      size_t res;
543
544
0
      res = iconv (cd, NULL, NULL, &outptr, &outsize);
545
0
      length = outptr - result;
546
0
      if (res == (size_t)(-1))
547
0
        {
548
0
          if (errno == E2BIG)
549
0
            {
550
0
              char *memory;
551
552
0
              allocated = 2 * allocated;
553
0
              if (result == initial_result)
554
0
                memory = (char *) malloc (allocated);
555
0
              else
556
0
                memory = (char *) realloc (result, allocated);
557
0
              if (memory == NULL)
558
0
                {
559
0
                  if (result != initial_result)
560
0
                    free (result);
561
0
                  errno = ENOMEM;
562
0
                  return -1;
563
0
                }
564
0
              if (result == initial_result)
565
0
                memcpy (memory, initial_result, length);
566
0
              result = memory;
567
0
            }
568
0
          else
569
0
            {
570
0
              if (result != initial_result)
571
0
                free (result);
572
0
              return -1;
573
0
            }
574
0
        }
575
0
      else
576
0
        break;
577
0
    }
578
0
#endif
579
580
  /* The direct conversion succeeded.  */
581
0
  goto done;
582
583
0
 indirectly:
584
  /* The direct conversion failed.
585
     Use a conversion through UTF-8.  */
586
0
  if (offsets != NULL)
587
0
    {
588
0
      size_t i;
589
590
0
      for (i = 0; i < srclen; i++)
591
0
        offsets[i] = (size_t)(-1);
592
593
0
      last_length = (size_t)(-1);
594
0
    }
595
0
  length = 0;
596
0
  {
597
0
    const bool slowly = (offsets != NULL || handler == iconveh_error);
598
0
# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
599
0
    char utf8buf[utf8bufsize + 3];
600
0
    size_t utf8len = 0;
601
0
    const char *in1ptr = src;
602
0
    size_t in1size = srclen;
603
0
    bool do_final_flush1 = true;
604
0
    bool do_final_flush2 = true;
605
606
    /* Set to the initial state.  */
607
0
    if (cd1 != (iconv_t)(-1))
608
0
      iconv (cd1, NULL, NULL, NULL, NULL);
609
0
    if (cd2 != (iconv_t)(-1))
610
0
      iconv (cd2, NULL, NULL, NULL, NULL);
611
612
0
    while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
613
0
      {
614
0
        char *out1ptr = utf8buf + utf8len;
615
0
        size_t out1size = utf8bufsize - utf8len;
616
0
        bool incremented1;
617
0
        size_t res1;
618
0
        int errno1;
619
620
        /* Conversion step 1: from FROM_CODESET to UTF-8.  */
621
0
        if (in1size > 0)
622
0
          {
623
0
            if (offsets != NULL
624
0
                && length != last_length) /* ensure that offset[] be increasing */
625
0
              {
626
0
                offsets[in1ptr - src] = length;
627
0
                last_length = length;
628
0
              }
629
0
            if (cd1 != (iconv_t)(-1))
630
0
              {
631
0
                if (slowly)
632
0
                  res1 = iconv_carefully_1 (cd1,
633
0
                                            &in1ptr, &in1size,
634
0
                                            &out1ptr, &out1size,
635
0
                                            &incremented1);
636
0
                else
637
0
                  res1 = iconv_carefully (cd1,
638
0
                                          &in1ptr, &in1size,
639
0
                                          &out1ptr, &out1size,
640
0
                                          &incremented1);
641
0
              }
642
0
            else
643
0
              {
644
                /* FROM_CODESET is UTF-8.  */
645
0
                res1 = utf8conv_carefully (slowly,
646
0
                                           &in1ptr, &in1size,
647
0
                                           &out1ptr, &out1size,
648
0
                                           &incremented1);
649
0
              }
650
0
          }
651
0
        else if (do_final_flush1)
652
0
          {
653
            /* Now get the conversion state of CD1 back to the initial state.
654
               But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
655
0
# if defined _LIBICONV_VERSION \
656
0
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
657
0
          || defined __sun)
658
0
            if (cd1 != (iconv_t)(-1))
659
0
              res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
660
0
            else
661
0
# endif
662
0
              res1 = 0;
663
0
            do_final_flush1 = false;
664
0
            incremented1 = true;
665
0
          }
666
0
        else
667
0
          {
668
0
            res1 = 0;
669
0
            incremented1 = true;
670
0
          }
671
0
        if (res1 == (size_t)(-1)
672
0
            && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
673
0
          {
674
0
            if (result != initial_result)
675
0
              free (result);
676
0
            return -1;
677
0
          }
678
0
        if (res1 == (size_t)(-1)
679
0
            && errno == EILSEQ && handler != iconveh_error)
680
0
          {
681
            /* The input is invalid in FROM_CODESET.  Eat up one byte and
682
               emit a U+FFFD character or a question mark.  Room for this
683
               character was allocated at the end of utf8buf.  */
684
0
            if (!incremented1)
685
0
              {
686
0
                if (in1size == 0)
687
0
                  abort ();
688
0
                in1ptr++;
689
0
                in1size--;
690
0
              }
691
0
            if (handler == iconveh_replacement_character)
692
0
              {
693
                /* U+FFFD in UTF-8 encoding.  */
694
0
                out1ptr[0] = '\357';
695
0
                out1ptr[1] = '\277';
696
0
                out1ptr[2] = '\275';
697
0
                out1ptr += 3;
698
0
              }
699
0
            else
700
0
              *out1ptr++ = '?';
701
0
            res1 = 0;
702
0
          }
703
0
        errno1 = errno;
704
0
        utf8len = out1ptr - utf8buf;
705
706
0
        if (offsets != NULL
707
0
            || in1size == 0
708
0
            || utf8len > utf8bufsize / 2
709
0
            || (res1 == (size_t)(-1) && errno1 == E2BIG))
710
0
          {
711
            /* Conversion step 2: from UTF-8 to TO_CODESET.  */
712
0
            const char *in2ptr = utf8buf;
713
0
            size_t in2size = utf8len;
714
715
0
            while (in2size > 0
716
0
                   || (in1size == 0 && !do_final_flush1 && do_final_flush2))
717
0
              {
718
0
                char *out2ptr = result + length;
719
0
                size_t out2size = allocated - extra_alloc - length;
720
0
                bool incremented2;
721
0
                size_t res2;
722
0
                bool grow;
723
724
0
                if (in2size > 0)
725
0
                  {
726
0
                    if (cd2 != (iconv_t)(-1))
727
0
                      res2 = iconv_carefully (cd2,
728
0
                                              &in2ptr, &in2size,
729
0
                                              &out2ptr, &out2size,
730
0
                                              &incremented2);
731
0
                    else
732
                      /* TO_CODESET is UTF-8.  */
733
0
                      res2 = utf8conv_carefully (false,
734
0
                                                 &in2ptr, &in2size,
735
0
                                                 &out2ptr, &out2size,
736
0
                                                 &incremented2);
737
0
                  }
738
0
                else /* in1size == 0 && !do_final_flush1
739
                        && in2size == 0 && do_final_flush2 */
740
0
                  {
741
                    /* Now get the conversion state of CD2 back to the initial
742
                       state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
743
0
# if defined _LIBICONV_VERSION \
744
0
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
745
0
          || defined __sun)
746
0
                    if (cd2 != (iconv_t)(-1))
747
0
                      res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
748
0
                    else
749
0
# endif
750
0
                      res2 = 0;
751
0
                    do_final_flush2 = false;
752
0
                    incremented2 = true;
753
0
                  }
754
755
0
                length = out2ptr - result;
756
0
                grow = (length + extra_alloc > allocated / 2);
757
0
                if (res2 == (size_t)(-1))
758
0
                  {
759
0
                    if (errno == E2BIG)
760
0
                      grow = true;
761
0
                    else if (errno == EINVAL)
762
0
                      break;
763
0
                    else if (errno == EILSEQ && handler != iconveh_error)
764
0
                      {
765
                        /* Error handling can produce up to 10 bytes of UTF-8
766
                           output.  But TO_CODESET may be UCS-2, UTF-16 or
767
                           UCS-4, so use CD2 here as well.  */
768
0
                        char scratchbuf[10];
769
0
                        size_t scratchlen;
770
0
                        ucs4_t uc;
771
0
                        const char *inptr;
772
0
                        size_t insize;
773
0
                        size_t res;
774
775
0
                        if (incremented2)
776
0
                          {
777
0
                            if (u8_prev (&uc, (const uint8_t *) in2ptr,
778
0
                                         (const uint8_t *) utf8buf)
779
0
                                == NULL)
780
0
                              abort ();
781
0
                          }
782
0
                        else
783
0
                          {
784
0
                            int n;
785
0
                            if (in2size == 0)
786
0
                              abort ();
787
0
                            n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
788
0
                                                  in2size);
789
0
                            in2ptr += n;
790
0
                            in2size -= n;
791
0
                          }
792
793
0
                        if (handler == iconveh_escape_sequence)
794
0
                          {
795
0
                            static char const hex[16] = "0123456789ABCDEF";
796
0
                            scratchlen = 0;
797
0
                            scratchbuf[scratchlen++] = '\\';
798
0
                            if (uc < 0x10000)
799
0
                              scratchbuf[scratchlen++] = 'u';
800
0
                            else
801
0
                              {
802
0
                                scratchbuf[scratchlen++] = 'U';
803
0
                                scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
804
0
                                scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
805
0
                                scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
806
0
                                scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
807
0
                              }
808
0
                            scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
809
0
                            scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
810
0
                            scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
811
0
                            scratchbuf[scratchlen++] = hex[uc & 15];
812
0
                          }
813
0
                        else if (handler == iconveh_replacement_character)
814
0
                          {
815
                            /* U+FFFD in UTF-8 encoding.  */
816
0
                            scratchbuf[0] = '\357';
817
0
                            scratchbuf[1] = '\277';
818
0
                            scratchbuf[2] = '\275';
819
0
                            scratchlen = 3;
820
0
                          }
821
0
                        else
822
0
                          {
823
0
                            scratchbuf[0] = '?';
824
0
                            scratchlen = 1;
825
0
                          }
826
827
0
                        inptr = scratchbuf;
828
0
                        insize = scratchlen;
829
0
                        if (cd2 != (iconv_t)(-1))
830
0
                          {
831
0
                            char *out2ptr_try = out2ptr;
832
0
                            size_t out2size_try = out2size;
833
0
                            res = iconv (cd2,
834
0
                                         (ICONV_CONST char **) &inptr, &insize,
835
0
                                         &out2ptr_try, &out2size_try);
836
0
                            if (handler == iconveh_replacement_character
837
0
                                && (res == (size_t)(-1)
838
0
                                    ? errno == EILSEQ
839
                                    /* FreeBSD iconv(), NetBSD iconv(), and
840
                                       Solaris 11 iconv() insert a '?' if they
841
                                       cannot convert.  This is what we want.
842
                                       But IRIX iconv() inserts a NUL byte if it
843
                                       cannot convert.
844
                                       And musl libc iconv() inserts a '*' if it
845
                                       cannot convert.  */
846
0
                                    : (res > 0
847
0
                                       && !(out2ptr_try - out2ptr == 1
848
0
                                            && *out2ptr == '?'))))
849
0
                              {
850
                                /* The iconv() call failed.
851
                                   U+FFFD can't be converted to TO_CODESET.
852
                                   Use '?' instead.  */
853
0
                                scratchbuf[0] = '?';
854
0
                                scratchlen = 1;
855
0
                                inptr = scratchbuf;
856
0
                                insize = scratchlen;
857
0
                                res = iconv (cd2,
858
0
                                             (ICONV_CONST char **) &inptr, &insize,
859
0
                                             &out2ptr, &out2size);
860
0
                              }
861
0
                            else
862
0
                              {
863
                                /* Accept the results of the iconv() call.  */
864
0
                                out2ptr = out2ptr_try;
865
0
                                out2size = out2size_try;
866
0
                                res = 0;
867
0
                              }
868
0
                          }
869
0
                        else
870
0
                          {
871
                            /* TO_CODESET is UTF-8.  */
872
0
                            if (out2size >= insize)
873
0
                              {
874
0
                                memcpy (out2ptr, inptr, insize);
875
0
                                out2ptr += insize;
876
0
                                out2size -= insize;
877
0
                                inptr += insize;
878
0
                                insize = 0;
879
0
                                res = 0;
880
0
                              }
881
0
                            else
882
0
                              {
883
0
                                errno = E2BIG;
884
0
                                res = (size_t)(-1);
885
0
                              }
886
0
                          }
887
0
                        length = out2ptr - result;
888
0
                        if (res == (size_t)(-1) && errno == E2BIG)
889
0
                          {
890
0
                            char *memory;
891
892
0
                            allocated = 2 * allocated;
893
0
                            if (length + 1 + extra_alloc > allocated)
894
0
                              abort ();
895
0
                            if (result == initial_result)
896
0
                              memory = (char *) malloc (allocated);
897
0
                            else
898
0
                              memory = (char *) realloc (result, allocated);
899
0
                            if (memory == NULL)
900
0
                              {
901
0
                                if (result != initial_result)
902
0
                                  free (result);
903
0
                                errno = ENOMEM;
904
0
                                return -1;
905
0
                              }
906
0
                            if (result == initial_result)
907
0
                              memcpy (memory, initial_result, length);
908
0
                            result = memory;
909
0
                            grow = false;
910
911
0
                            out2ptr = result + length;
912
0
                            out2size = allocated - extra_alloc - length;
913
0
                            if (cd2 != (iconv_t)(-1))
914
0
                              res = iconv (cd2,
915
0
                                           (ICONV_CONST char **) &inptr,
916
0
                                           &insize,
917
0
                                           &out2ptr, &out2size);
918
0
                            else
919
0
                              {
920
                                /* TO_CODESET is UTF-8.  */
921
0
                                if (!(out2size >= insize))
922
0
                                  abort ();
923
0
                                memcpy (out2ptr, inptr, insize);
924
0
                                out2ptr += insize;
925
0
                                out2size -= insize;
926
0
                                inptr += insize;
927
0
                                insize = 0;
928
0
                                res = 0;
929
0
                              }
930
0
                            length = out2ptr - result;
931
0
                          }
932
# if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
933
     && !(defined __GLIBC__ && !defined __UCLIBC__)
934
                        /* IRIX iconv() inserts a NUL byte if it cannot convert.
935
                           FreeBSD iconv(), NetBSD iconv(), and Solaris 11
936
                           iconv() insert a '?' if they cannot convert.
937
                           musl libc iconv() inserts a '*' if it cannot convert.
938
                           Only GNU libiconv (excluding the bastard Apple iconv)
939
                           and GNU libc are known to prefer to fail rather than
940
                           doing a lossy conversion.  */
941
                        if (res != (size_t)(-1) && res > 0)
942
                          {
943
                            errno = EILSEQ;
944
                            res = (size_t)(-1);
945
                          }
946
# endif
947
0
                        if (res == (size_t)(-1))
948
0
                          {
949
                            /* Failure converting the ASCII replacement.  */
950
0
                            if (result != initial_result)
951
0
                              free (result);
952
0
                            return -1;
953
0
                          }
954
0
                      }
955
0
                    else
956
0
                      {
957
0
                        if (result != initial_result)
958
0
                          free (result);
959
0
                        return -1;
960
0
                      }
961
0
                  }
962
0
                if (!(in2size > 0
963
0
                      || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
964
0
                  break;
965
0
                if (grow)
966
0
                  {
967
0
                    char *memory;
968
969
0
                    allocated = 2 * allocated;
970
0
                    if (result == initial_result)
971
0
                      memory = (char *) malloc (allocated);
972
0
                    else
973
0
                      memory = (char *) realloc (result, allocated);
974
0
                    if (memory == NULL)
975
0
                      {
976
0
                        if (result != initial_result)
977
0
                          free (result);
978
0
                        errno = ENOMEM;
979
0
                        return -1;
980
0
                      }
981
0
                    if (result == initial_result)
982
0
                      memcpy (memory, initial_result, length);
983
0
                    result = memory;
984
0
                  }
985
0
              }
986
987
            /* Move the remaining bytes to the beginning of utf8buf.  */
988
0
            if (in2size > 0)
989
0
              memmove (utf8buf, in2ptr, in2size);
990
0
            utf8len = in2size;
991
0
          }
992
993
0
        if (res1 == (size_t)(-1))
994
0
          {
995
0
            if (errno1 == EINVAL)
996
0
              in1size = 0;
997
0
            else if (errno1 == EILSEQ)
998
0
              {
999
0
                if (result != initial_result)
1000
0
                  free (result);
1001
0
                errno = errno1;
1002
0
                return -1;
1003
0
              }
1004
0
          }
1005
0
      }
1006
0
# undef utf8bufsize
1007
0
  }
1008
1009
0
 done:
1010
  /* Now the final memory allocation.  */
1011
0
  if (result == tmpbuf)
1012
0
    {
1013
0
      size_t memsize = length + extra_alloc;
1014
1015
0
      if (*resultp != NULL && *lengthp >= memsize)
1016
0
        result = *resultp;
1017
0
      else
1018
0
        {
1019
0
          char *memory;
1020
1021
0
          memory = (char *) malloc (memsize > 0 ? memsize : 1);
1022
0
          if (memory != NULL)
1023
0
            result = memory;
1024
0
          else
1025
0
            {
1026
0
              errno = ENOMEM;
1027
0
              return -1;
1028
0
            }
1029
0
        }
1030
0
      memcpy (result, tmpbuf, length);
1031
0
    }
1032
0
  else if (result != *resultp && length + extra_alloc < allocated)
1033
0
    {
1034
      /* Shrink the allocated memory if possible.  */
1035
0
      size_t memsize = length + extra_alloc;
1036
0
      char *memory;
1037
1038
0
      memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
1039
0
      if (memory != NULL)
1040
0
        result = memory;
1041
0
    }
1042
0
  *resultp = result;
1043
0
  *lengthp = length;
1044
0
  return 0;
1045
0
# undef tmpbuf
1046
0
# undef tmpbufsize
1047
0
}
1048
1049
int
1050
mem_cd_iconveh (const char *src, size_t srclen,
1051
                const iconveh_t *cd,
1052
                enum iconv_ilseq_handler handler,
1053
                size_t *offsets,
1054
                char **resultp, size_t *lengthp)
1055
0
{
1056
0
  return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1057
0
                                  handler, 0, offsets, resultp, lengthp);
1058
0
}
1059
1060
char *
1061
str_cd_iconveh (const char *src,
1062
                const iconveh_t *cd,
1063
                enum iconv_ilseq_handler handler)
1064
0
{
1065
  /* For most encodings, a trailing NUL byte in the input will be converted
1066
     to a trailing NUL byte in the output.  But not for UTF-7.  So that this
1067
     function is usable for UTF-7, we have to exclude the NUL byte from the
1068
     conversion and add it by hand afterwards.  */
1069
0
  char *result = NULL;
1070
0
  size_t length = 0;
1071
0
  int retval = mem_cd_iconveh_internal (src, strlen (src),
1072
0
                                        cd->cd, cd->cd1, cd->cd2, handler, 1,
1073
0
                                        NULL, &result, &length);
1074
1075
0
  if (retval < 0)
1076
0
    {
1077
0
      free (result);
1078
0
      return NULL;
1079
0
    }
1080
1081
  /* Add the terminating NUL byte.  */
1082
0
  result[length] = '\0';
1083
1084
0
  return result;
1085
0
}
1086
1087
#endif
1088
1089
int
1090
mem_iconveh (const char *src, size_t srclen,
1091
             const char *from_codeset, const char *to_codeset,
1092
             enum iconv_ilseq_handler handler,
1093
             size_t *offsets,
1094
             char **resultp, size_t *lengthp)
1095
0
{
1096
0
  if (srclen == 0)
1097
0
    {
1098
      /* Nothing to convert.  */
1099
0
      *lengthp = 0;
1100
0
      return 0;
1101
0
    }
1102
0
  else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1103
0
    {
1104
0
      char *result;
1105
1106
0
      if (*resultp != NULL && *lengthp >= srclen)
1107
0
        result = *resultp;
1108
0
      else
1109
0
        {
1110
0
          result = (char *) malloc (srclen);
1111
0
          if (result == NULL)
1112
0
            {
1113
0
              errno = ENOMEM;
1114
0
              return -1;
1115
0
            }
1116
0
        }
1117
0
      memcpy (result, src, srclen);
1118
0
      *resultp = result;
1119
0
      *lengthp = srclen;
1120
0
      return 0;
1121
0
    }
1122
0
  else
1123
0
    {
1124
0
#if HAVE_ICONV
1125
0
      iconveh_t cd;
1126
0
      char *result;
1127
0
      size_t length;
1128
0
      int retval;
1129
1130
0
      if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1131
0
        return -1;
1132
1133
0
      result = *resultp;
1134
0
      length = *lengthp;
1135
0
      retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1136
0
                               &result, &length);
1137
1138
0
      if (retval < 0)
1139
0
        {
1140
          /* Close cd, but preserve the errno from str_cd_iconv.  */
1141
0
          int saved_errno = errno;
1142
0
          iconveh_close (&cd);
1143
0
          errno = saved_errno;
1144
0
        }
1145
0
      else
1146
0
        {
1147
0
          if (iconveh_close (&cd) < 0)
1148
0
            {
1149
0
              if (result != *resultp)
1150
0
                free (result);
1151
0
              return -1;
1152
0
            }
1153
0
          *resultp = result;
1154
0
          *lengthp = length;
1155
0
        }
1156
0
      return retval;
1157
#else
1158
      /* This is a different error code than if iconv_open existed but didn't
1159
         support from_codeset and to_codeset, so that the caller can emit
1160
         an error message such as
1161
           "iconv() is not supported. Installing GNU libiconv and
1162
            then reinstalling this package would fix this."  */
1163
      errno = ENOSYS;
1164
      return -1;
1165
#endif
1166
0
    }
1167
0
}
1168
1169
char *
1170
str_iconveh (const char *src,
1171
             const char *from_codeset, const char *to_codeset,
1172
             enum iconv_ilseq_handler handler)
1173
0
{
1174
0
  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1175
0
    {
1176
0
      char *result = strdup (src);
1177
1178
0
      if (result == NULL)
1179
0
        errno = ENOMEM;
1180
0
      return result;
1181
0
    }
1182
0
  else
1183
0
    {
1184
0
#if HAVE_ICONV
1185
0
      iconveh_t cd;
1186
0
      char *result;
1187
1188
0
      if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1189
0
        return NULL;
1190
1191
0
      result = str_cd_iconveh (src, &cd, handler);
1192
1193
0
      if (result == NULL)
1194
0
        {
1195
          /* Close cd, but preserve the errno from str_cd_iconv.  */
1196
0
          int saved_errno = errno;
1197
0
          iconveh_close (&cd);
1198
0
          errno = saved_errno;
1199
0
        }
1200
0
      else
1201
0
        {
1202
0
          if (iconveh_close (&cd) < 0)
1203
0
            {
1204
0
              free (result);
1205
0
              return NULL;
1206
0
            }
1207
0
        }
1208
0
      return result;
1209
#else
1210
      /* This is a different error code than if iconv_open existed but didn't
1211
         support from_codeset and to_codeset, so that the caller can emit
1212
         an error message such as
1213
           "iconv() is not supported. Installing GNU libiconv and
1214
            then reinstalling this package would fix this."  */
1215
      errno = ENOSYS;
1216
      return NULL;
1217
#endif
1218
0
    }
1219
0
}