/src/libunistring/lib/striconveh.c

Source
/* Character set conversion with error handling.
   Copyright (C) 2001-2026 Free Software Foundation, Inc.
   Written by Bruno Haible and Simon Josefsson.

   This file is free software: you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of the
   License, or (at your option) any later version.

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

#include <config.h>

/* Specification.  */
#include "striconveh.h"

#include <errno.h>
#include <stdlib.h>
#include <string.h>

#if HAVE_ICONV
# include <iconv.h>
# include "unistr.h"
#endif

#include "c-strcase.h"
#include "c-strcaseeq.h"

#ifndef SIZE_MAX
# define SIZE_MAX ((size_t) -1)
#endif


#if HAVE_ICONV

/* The caller must provide an iconveh_t, not just an iconv_t, because when a
   conversion error occurs, we may have to determine the Unicode representation
   of the inconvertible character.  */

/* Open the conversion descriptors for converting from FROM_CODESET to
   TO_CODESET and store them in *CDP:
     - CDP->cd   converts directly; it may be (iconv_t)(-1) if iconv_open
                 does not support the pair,
     - CDP->cd1  converts FROM_CODESET to UTF-8, or is (iconv_t)(-1) if
                 FROM_CODESET is UTF-8,
     - CDP->cd2  converts UTF-8 to TO_CODESET, or is (iconv_t)(-1) if
                 TO_CODESET is UTF-8 (or, on the platforms selected below,
                 UTF-8//TRANSLIT).
   Return 0 if successful.  If one of the two UTF-8 legs cannot be opened,
   close whatever was opened so far, leave *CDP untouched, and return -1 with
   errno set by the failing iconv_open call.  */
int
iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
{
  iconv_t const none = (iconv_t)(-1);
  iconv_t direct;
  iconv_t to_utf8 = none;
  iconv_t from_utf8 = none;
  int failure_errno;

  /* A failure of the direct descriptor alone is not reported as an error;
     the value is stored as is.  */
  direct = iconv_open (to_codeset, from_codeset);

  if (!(STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)))
    {
      to_utf8 = iconv_open ("UTF-8", from_codeset);
      if (to_utf8 == none)
        {
          failure_errno = errno;
          goto fail;
        }
    }

  if (!(STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
# if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
      && !defined __UCLIBC__) \
     || _LIBICONV_VERSION >= 0x0105 \
     || defined ICONV_SET_TRANSLITERATE
        || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
# endif
       ))
    {
      from_utf8 = iconv_open (to_codeset, "UTF-8");
      if (from_utf8 == none)
        {
          failure_errno = errno;
          goto fail;
        }
    }

  cdp->cd = direct;
  cdp->cd1 = to_utf8;
  cdp->cd2 = from_utf8;
  return 0;

 fail:
  /* Release what has been opened, then report the errno of the iconv_open
     call that failed, not that of the iconv_close calls.  */
  if (to_utf8 != none)
    iconv_close (to_utf8);
  if (direct != none)
    iconv_close (direct);
  errno = failure_errno;
  return -1;
}

/* Close the conversion descriptors held in *CD, in the order cd2, cd1, cd.
   Descriptors equal to (iconv_t)(-1) are skipped.
   Return 0 if every iconv_close call succeeded.  Otherwise return -1 with
   errno set by the first iconv_close call that failed; the remaining
   descriptors are still closed, but their results are ignored.  */
int
iconveh_close (const iconveh_t *cd)
{
  iconv_t const handles[3] = { cd->cd2, cd->cd1, cd->cd };
  size_t const count = sizeof handles / sizeof handles[0];

  for (size_t i = 0; i < count; i++)
    {
      if (handles[i] == (iconv_t)(-1) || iconv_close (handles[i]) >= 0)
        continue;

      /* Return -1, but preserve the errno from the failed iconv_close
         while the descriptors that come after it are released.  */
      int saved_errno = errno;
      for (size_t j = i + 1; j < count; j++)
        if (handles[j] != (iconv_t)(-1))
          iconv_close (handles[j]);
      errno = saved_errno;
      return -1;
    }

  return 0;
}

/* iconv_carefully is like iconv, except that it stops as soon as it encounters
   a conversion error, and it returns in *INCREMENTED a boolean telling whether
   it has incremented the input pointers past the error location.  */
# if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
     && !(defined __GLIBC__ && !defined __UCLIBC__)
/* NetBSD iconv() inserts a question mark if it cannot convert.
   Only GNU libiconv (excluding the bastard Apple iconv) and GNU libc are
   known to prefer to fail rather than doing a lossy conversion.  */

/* Convert the *INBYTESLEFT bytes at *INBUF through CD into the
   *OUTBYTESLEFT bytes at *OUTBUF, one step at a time.  For each step, iconv()
   is offered 1, 2, 3, ... input bytes until it stops answering EINVAL.
   *INBUF and *INBYTESLEFT are always updated to the position reached.
   *OUTBUF and *OUTBYTESLEFT are advanced only past steps whose result was 0.
   Return value:
     - 0 if the whole input was converted (also if the input is empty),
     - (size_t)(-1) with errno = EILSEQ and *INCREMENTED telling whether
       *INBUF was advanced during the failing step, if iconv() returned a
       positive count of irreversible conversions,
     - otherwise (size_t)(-1) with the errno set by iconv(), and
       *INCREMENTED = false.  */
static size_t
iconv_carefully (iconv_t cd,
                 const char **inbuf, size_t *inbytesleft,
                 char **outbuf, size_t *outbytesleft,
                 bool *incremented)
{
  const char *inptr = *inbuf;
  const char *inptr_end = inptr + *inbytesleft;
  char *outptr = *outbuf;
  size_t outsize = *outbytesleft;
  const char *inptr_before;
  size_t res;

  /* Nothing to convert.  Without this check no iconv() call would be made
     and (size_t)(-1) would be returned with a stale errno.  */
  if (inptr == inptr_end)
    {
      *incremented = false;
      return 0;
    }

  do
    {
      inptr_before = inptr;
      res = (size_t)(-1);

      /* The bound is expressed as a byte count rather than as
         INPTR + INSIZE <= INPTR_END, so that no pointer beyond INPTR_END
         is ever formed.  */
      for (size_t insize = 1; insize <= (size_t) (inptr_end - inptr); insize++)
        {
          res = iconv (cd,
                       (ICONV_CONST char **) &inptr, &insize,
                       &outptr, &outsize);
          if (!(res == (size_t)(-1) && errno == EINVAL))
            break;
          /* iconv can eat up a shift sequence but give EINVAL while attempting
             to convert the first character.  E.g. libiconv does this.  */
          if (inptr > inptr_before)
            {
              res = 0;
              break;
            }
        }

      /* Commit the output position only for a cleanly converted step.  */
      if (res == 0)
        {
          *outbuf = outptr;
          *outbytesleft = outsize;
        }
    }
  while (res == 0 && inptr < inptr_end);

  *inbuf = inptr;
  *inbytesleft = inptr_end - inptr;
  if (res != (size_t)(-1) && res > 0)
    {
      /* iconv() has already incremented INPTR.  We cannot go back to a
         previous INPTR, otherwise the state inside CD would become invalid,
         if FROM_CODESET is a stateful encoding.  So, tell the caller that
         *INBUF has already been incremented.  */
      *incremented = (inptr > inptr_before);
      errno = EILSEQ;
      return (size_t)(-1);
    }
  else
    {
      *incremented = false;
      return res;
    }
}
# else
/* On the iconv implementations selected by this branch, iconv_carefully is
   a plain iconv() call; *INCREMENTED is always set to false before the call.
   The value of the expression is the return value of iconv().  */
#  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
     (*(incremented) = false, \
      iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
# endif

/* iconv_carefully_1 is like iconv_carefully, except that it stops after
   converting one character or one shift sequence.
   iconv() is offered 1, 2, 3, ... bytes starting at *INBUF until it stops
   answering EINVAL or has consumed some input.
   *INBUF and *INBYTESLEFT are always updated to the position reached.
   *OUTBUF and *OUTBYTESLEFT are updated unless (size_t)(-1) is returned.
   Return value:
     - 0 if the input is empty,
     - (size_t)(-1) with errno = EILSEQ and *INCREMENTED telling whether
       *INBUF was advanced, if iconv() returned a positive count of
       irreversible conversions (only on the implementations selected by
       the #if below),
     - otherwise the result of the last iconv() call (0 if only a shift
       sequence was consumed), with *INCREMENTED = false.  */
static size_t
iconv_carefully_1 (iconv_t cd,
                   const char **inbuf, size_t *inbytesleft,
                   char **outbuf, size_t *outbytesleft,
                   bool *incremented)
{
  const char *inptr_before = *inbuf;
  const char *inptr = inptr_before;
  const size_t inavail = *inbytesleft;
  const char *inptr_end = inptr_before + inavail;
  char *outptr = *outbuf;
  size_t outsize = *outbytesleft;
  size_t res = (size_t)(-1);

  /* Nothing to convert.  Without this check no iconv() call would be made
     and (size_t)(-1) would be returned with a stale errno.  */
  if (inavail == 0)
    {
      *incremented = false;
      return 0;
    }

  /* The bound is expressed as a byte count rather than as
     INPTR_BEFORE + INSIZE <= INPTR_END, so that no pointer beyond INPTR_END
     is ever formed.  */
  for (size_t insize = 1; insize <= inavail; insize++)
    {
      inptr = inptr_before;
      res = iconv (cd,
                   (ICONV_CONST char **) &inptr, &insize,
                   &outptr, &outsize);
      if (!(res == (size_t)(-1) && errno == EINVAL))
        break;
      /* iconv can eat up a shift sequence but give EINVAL while attempting
         to convert the first character.  E.g. libiconv does this.  */
      if (inptr > inptr_before)
        {
          res = 0;
          break;
        }
    }

  *inbuf = inptr;
  *inbytesleft = inptr_end - inptr;
# if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
     && !(defined __GLIBC__ && !defined __UCLIBC__)
  /* NetBSD iconv() inserts a question mark if it cannot convert.
     Only GNU libiconv (excluding the bastard Apple iconv) and GNU libc are
     known to prefer to fail rather than doing a lossy conversion.  */
  if (res != (size_t)(-1) && res > 0)
    {
      /* iconv() has already incremented INPTR.  We cannot go back to a
         previous INPTR, otherwise the state inside CD would become invalid,
         if FROM_CODESET is a stateful encoding.  So, tell the caller that
         *INBUF has already been incremented.  */
      *incremented = (inptr > inptr_before);
      errno = EILSEQ;
      return (size_t)(-1);
    }
# endif

  /* Commit the output position only if the step did not fail.  */
  if (res != (size_t)(-1))
    {
      *outbuf = outptr;
      *outbytesleft = outsize;
    }
  *incremented = false;
  return res;
}

/* utf8conv_carefully is like iconv, except that
     - it converts from UTF-8 to UTF-8,
     - it stops as soon as it encounters a conversion error, and it returns
       in *INCREMENTED a boolean telling whether it has incremented the input
       pointers past the error location,
     - if one_character_only is true, it stops after converting one
       character.
   *INBUF, *INBYTESLEFT, *OUTBUF, *OUTBYTESLEFT are always updated to the
   position reached.
   Return value: 0 if no error occurred (also if the input is empty, in which
   case nothing is read), with *INCREMENTED = false.  Otherwise (size_t)(-1)
   with errno set to
     - EINVAL or EILSEQ if u8_mbtoucr rejected the input (EINVAL when it
       returned -2); the offending bytes have been skipped and *INCREMENTED
       is true,
     - E2BIG if the character does not fit into the output; the character
       has not been consumed and *INCREMENTED is false,
     - EILSEQ if u8_uctomb returned -1; the character has been consumed and
       *INCREMENTED is true.  */
static size_t
utf8conv_carefully (bool one_character_only,
                    const char **inbuf, size_t *inbytesleft,
                    char **outbuf, size_t *outbytesleft,
                    bool *incremented)
{
  const char *inptr = *inbuf;
  size_t insize = *inbytesleft;
  char *outptr = *outbuf;
  size_t outsize = *outbytesleft;
  size_t res = 0;

  /* Give *INCREMENTED a defined value on every path, like iconv_carefully
     does.  */
  *incremented = false;

  /* Test INSIZE before the first iteration as well: the u8_mbtouc* functions
     must not be called with a length of 0.  */
  while (insize > 0)
    {
      ucs4_t uc;
      int n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
      if (n < 0)
        {
          errno = (n == -2 ? EINVAL : EILSEQ);
          /* Call u8_mbtouc to find out how many bytes to skip.  */
          n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
          inptr += n;
          insize -= n;
          res = (size_t)(-1);
          *incremented = true;
          break;
        }
      if (outsize == 0)
        {
          errno = E2BIG;
          res = (size_t)(-1);
          *incremented = false;
          break;
        }
      int m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
      if (m == -2)
        {
          errno = E2BIG;
          res = (size_t)(-1);
          *incremented = false;
          break;
        }
      /* From here on the input character counts as consumed.  */
      inptr += n;
      insize -= n;
      if (m == -1)
        {
          errno = EILSEQ;
          res = (size_t)(-1);
          *incremented = true;
          break;
        }
      outptr += m;
      outsize -= m;

      if (one_character_only)
        break;
    }

  *inbuf = inptr;
  *inbytesleft = insize;
  *outbuf = outptr;
  *outbytesleft = outsize;
  return res;
}

static int
mem_cd_iconveh_internal (const char *src, size_t srclen,
                         iconv_t cd, iconv_t cd1, iconv_t cd2,
                         enum iconv_ilseq_handler handler,
                         size_t extra_alloc,
                         size_t *offsets,
                         char **resultp, size_t *lengthp)
{
  /* When a conversion error occurs, we cannot start using CD1 and CD2 at
     this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
     Instead, we have to start afresh from the beginning of SRC.  */
  /* Use a temporary buffer, so that for small strings, a single malloc()
     call will be sufficient.  */
# define tmpbufsize 4096
  /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
     libiconv's UCS-4-INTERNAL encoding.  */
  union { unsigned int align; char buf[tmpbufsize]; } tmp;
# define tmpbuf tmp.buf

  char *initial_result;
  size_t allocated;
  if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
    {
      initial_result = *resultp;
      allocated = *lengthp;
    }
  else
    {
      initial_result = tmpbuf;
      allocated = sizeof (tmpbuf);
    }

  char *result = initial_result;

  /* Test whether a direct conversion is possible at all.  */
  if (cd == (iconv_t)(-1))
    goto indirectly;

  size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
  if (offsets != NULL)
    {
      for (size_t i = 0; i < srclen; i++)
        offsets[i] = (size_t)(-1);

      last_length = (size_t)(-1);
    }
  size_t length = 0;

  /* First, try a direct conversion, and see whether a conversion error
     occurs at all.  */
  {
    /* Set to the initial state.  */
    iconv (cd, NULL, NULL, NULL, NULL);

    const char *inptr = src;
    size_t insize = srclen;

    while (insize > 0)
      {
        char *outptr = result + length;
        size_t outsize = allocated - extra_alloc - length;

        bool incremented;
        size_t res;
        if (offsets != NULL)
          {
            if (length != last_length) /* ensure that offset[] be increasing */
              {
                offsets[inptr - src] = length;
                last_length = length;
              }
            res = iconv_carefully_1 (cd,
                                     &inptr, &insize,
                                     &outptr, &outsize,
                                     &incremented);
          }
        else
          /* Use iconv_carefully instead of iconv here, because:
             - If TO_CODESET is UTF-8, we can do the error handling in this
               loop, no need for a second loop,
             - With iconv() implementations other than GNU libiconv and GNU
               libc, if we use iconv() in a big swoop, checking for an E2BIG
               return, we lose the number of irreversible conversions.  */
          res = iconv_carefully (cd,
                                 &inptr, &insize,
                                 &outptr, &outsize,
                                 &incremented);

        length = outptr - result;
        bool grow = (length + extra_alloc > allocated / 2);
        if (res == (size_t)(-1))
          {
            if (errno == E2BIG)
              grow = true;
            else if (errno == EINVAL)
              break;
            else if (errno == EILSEQ && handler != iconveh_error)
              {
                if (cd2 == (iconv_t)(-1))
                  {
                    /* TO_CODESET is UTF-8.  */
                    /* Error handling can produce up to 1 or 3 bytes of
                       output.  */
                    size_t extra_need =
                      (handler == iconveh_replacement_character ? 3 : 1);
                    if (length + extra_need + extra_alloc > allocated)
                      {
                        allocated = 2 * allocated;
                        if (length + extra_need + extra_alloc > allocated)
                          allocated = 2 * allocated;
                        if (length + extra_need + extra_alloc > allocated)
                          abort ();
                        char *memory;
                        if (result == initial_result)
                          memory = (char *) malloc (allocated);
                        else
                          memory = (char *) realloc (result, allocated);
                        if (memory == NULL)
                          {
                            if (result != initial_result)
                              free (result);
                            errno = ENOMEM;
                            return -1;
                          }
                        if (result == initial_result)
                          memcpy (memory, initial_result, length);
                        result = memory;
                        grow = false;
                      }
                    /* The input is invalid in FROM_CODESET.  Eat up one byte
                       and emit a replacement character or a question mark.  */
                    if (!incremented)
                      {
                        if (insize == 0)
                          abort ();
                        inptr++;
                        insize--;
                      }
                    if (handler == iconveh_replacement_character)
                      {
                        /* U+FFFD in UTF-8 encoding.  */
                        result[length+0] = '\357';
                        result[length+1] = '\277';
                        result[length+2] = '\275';
                        length += 3;
                      }
                    else
                      {
                        result[length] = '?';
                        length++;
                      }
                  }
                else
                  goto indirectly;
              }
            else
              {
                if (result != initial_result)
                  free (result);
                return -1;
              }
          }
        if (insize == 0)
          break;
        if (grow)
          {
            allocated = 2 * allocated;
            char *memory;
            if (result == initial_result)
              memory = (char *) malloc (allocated);
            else
              memory = (char *) realloc (result, allocated);
            if (memory == NULL)
              {
                if (result != initial_result)
                  free (result);
                errno = ENOMEM;
                return -1;
              }
            if (result == initial_result)
              memcpy (memory, initial_result, length);
            result = memory;
          }
      }
  }

  /* Now get the conversion state back to the initial state.
     But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
#if defined _LIBICONV_VERSION \
    || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
         || defined __sun)
  for (;;)
    {
      char *outptr = result + length;
      size_t outsize = allocated - extra_alloc - length;
      size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
      length = outptr - result;
      if (res == (size_t)(-1))
        {
          if (errno == E2BIG)
            {

              allocated = 2 * allocated;
              char *memory;
              if (result == initial_result)
                memory = (char *) malloc (allocated);
              else
                memory = (char *) realloc (result, allocated);
              if (memory == NULL)
                {
                  if (result != initial_result)
                    free (result);
                  errno = ENOMEM;
                  return -1;
                }
              if (result == initial_result)
                memcpy (memory, initial_result, length);
              result = memory;
            }
          else
            {
              if (result != initial_result)
                free (result);
              return -1;
            }
        }
      else
        break;
    }
#endif

  /* The direct conversion succeeded.  */
  goto done;

 indirectly:
  /* The direct conversion failed.
     Use a conversion through UTF-8.  */
  if (offsets != NULL)
    {
      for (size_t i = 0; i < srclen; i++)
        offsets[i] = (size_t)(-1);

      last_length = (size_t)(-1);
    }
  length = 0;
  {
    /* Set to the initial state.  */
    if (cd1 != (iconv_t)(-1))
      iconv (cd1, NULL, NULL, NULL, NULL);
    if (cd2 != (iconv_t)(-1))
      iconv (cd2, NULL, NULL, NULL, NULL);

    const bool slowly = (offsets != NULL || handler == iconveh_error);
# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
    char utf8buf[utf8bufsize + 3];
    size_t utf8len = 0;
    const char *in1ptr = src;
    size_t in1size = srclen;
    bool do_final_flush1 = true;
    bool do_final_flush2 = true;

    while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
      {
        char *out1ptr = utf8buf + utf8len;
        size_t out1size = utf8bufsize - utf8len;

        /* Conversion step 1: from FROM_CODESET to UTF-8.  */
        bool incremented1;
        size_t res1;
        if (in1size > 0)
          {
            if (offsets != NULL
                && length != last_length) /* ensure that offset[] be increasing */
              {
                offsets[in1ptr - src] = length;
                last_length = length;
              }
            if (cd1 != (iconv_t)(-1))
              {
                if (slowly)
                  res1 = iconv_carefully_1 (cd1,
                                            &in1ptr, &in1size,
                                            &out1ptr, &out1size,
                                            &incremented1);
                else
                  res1 = iconv_carefully (cd1,
                                          &in1ptr, &in1size,
                                          &out1ptr, &out1size,
                                          &incremented1);
              }
            else
              {
                /* FROM_CODESET is UTF-8.  */
                res1 = utf8conv_carefully (slowly,
                                           &in1ptr, &in1size,
                                           &out1ptr, &out1size,
                                           &incremented1);
              }
          }
        else if (do_final_flush1)
          {
            /* Now get the conversion state of CD1 back to the initial state.
               But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
            if (cd1 != (iconv_t)(-1))
              res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
            else
# endif
              res1 = 0;
            do_final_flush1 = false;
            incremented1 = true;
          }
        else
          {
            res1 = 0;
            incremented1 = true;
          }
        if (res1 == (size_t)(-1)
            && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
          {
            if (result != initial_result)
              free (result);
            return -1;
          }
        if (res1 == (size_t)(-1)
            && errno == EILSEQ && handler != iconveh_error)
          {
            /* The input is invalid in FROM_CODESET.  Eat up one byte and
               emit a U+FFFD character or a question mark.  Room for this
               character was allocated at the end of utf8buf.  */
            if (!incremented1)
              {
                if (in1size == 0)
                  abort ();
                in1ptr++;
                in1size--;
              }
            if (handler == iconveh_replacement_character)
              {
                /* U+FFFD in UTF-8 encoding.  */
                out1ptr[0] = '\357';
                out1ptr[1] = '\277';
                out1ptr[2] = '\275';
                out1ptr += 3;
              }
            else
              *out1ptr++ = '?';
            res1 = 0;
          }
        int errno1 = errno;
        utf8len = out1ptr - utf8buf;

        if (offsets != NULL
            || in1size == 0
            || utf8len > utf8bufsize / 2
            || (res1 == (size_t)(-1) && errno1 == E2BIG))
          {
            /* Conversion step 2: from UTF-8 to TO_CODESET.  */
            const char *in2ptr = utf8buf;
            size_t in2size = utf8len;

            while (in2size > 0
                   || (in1size == 0 && !do_final_flush1 && do_final_flush2))
              {
                char *out2ptr = result + length;
                size_t out2size = allocated - extra_alloc - length;

                bool incremented2;
                size_t res2;
                if (in2size > 0)
                  {
                    if (cd2 != (iconv_t)(-1))
                      res2 = iconv_carefully (cd2,
                                              &in2ptr, &in2size,
                                              &out2ptr, &out2size,
                                              &incremented2);
                    else
                      /* TO_CODESET is UTF-8.  */
                      res2 = utf8conv_carefully (false,
                                                 &in2ptr, &in2size,
                                                 &out2ptr, &out2size,
                                                 &incremented2);
                  }
                else /* in1size == 0 && !do_final_flush1
                        && in2size == 0 && do_final_flush2 */
                  {
                    /* Now get the conversion state of CD2 back to the initial
                       state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
                    if (cd2 != (iconv_t)(-1))
                      res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
                    else
# endif
                      res2 = 0;
                    do_final_flush2 = false;
                    incremented2 = true;
                  }

                length = out2ptr - result;
                bool grow = (length + extra_alloc > allocated / 2);
                if (res2 == (size_t)(-1))
                  {
                    if (errno == E2BIG)
                      grow = true;
                    else if (errno == EINVAL)
                      break;
                    else if (errno == EILSEQ && handler != iconveh_error)
                      {
                        /* Error handling can produce up to 10 bytes of UTF-8
                           output.  But TO_CODESET may be UCS-2, UTF-16 or
                           UCS-4, so use CD2 here as well.  */
                        ucs4_t uc;

                        if (incremented2)
                          {
                            if (u8_prev (&uc, (const uint8_t *) in2ptr,
                                         (const uint8_t *) utf8buf)
                                == NULL)
                              abort ();
                          }
                        else
                          {
                            int n;
                            if (in2size == 0)
                              abort ();
                            n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
                                                  in2size);
                            in2ptr += n;
                            in2size -= n;
                          }

                        char scratchbuf[10];
                        size_t scratchlen;
                        if (handler == iconveh_escape_sequence)
                          {
                            static char const hex[16] _GL_ATTRIBUTE_NONSTRING =
                              "0123456789ABCDEF";
                            scratchlen = 0;
                            scratchbuf[scratchlen++] = '\\';
                            if (uc < 0x10000)
                              scratchbuf[scratchlen++] = 'u';
                            else
                              {
                                scratchbuf[scratchlen++] = 'U';
                                scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
                                scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
                                scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
                                scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
                              }
                            scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
                            scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
                            scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
                            scratchbuf[scratchlen++] = hex[uc & 15];
                          }
                        else if (handler == iconveh_replacement_character)
                          {
                            /* U+FFFD in UTF-8 encoding.  */
                            scratchbuf[0] = '\357';
                            scratchbuf[1] = '\277';
                            scratchbuf[2] = '\275';
                            scratchlen = 3;
                          }
                        else
                          {
                            scratchbuf[0] = '?';
                            scratchlen = 1;
                          }

                        const char *inptr = scratchbuf;
                        size_t insize = scratchlen;
                        size_t res;
                        if (cd2 != (iconv_t)(-1))
                          {
                            char *out2ptr_try = out2ptr;
                            size_t out2size_try = out2size;
                            res = iconv (cd2,
                                         (ICONV_CONST char **) &inptr, &insize,
                                         &out2ptr_try, &out2size_try);
                            if (handler == iconveh_replacement_character
                                && (res == (size_t)(-1)
                                    ? errno == EILSEQ
                                    /* FreeBSD iconv(), NetBSD iconv(), and
                                       Solaris 11 iconv() insert a '?' if they
                                       cannot convert.  This is what we want.
                                       But musl libc iconv() inserts a '*' if it
                                       cannot convert.  */
                                    : (res > 0
                                       && !(out2ptr_try - out2ptr == 1
                                            && *out2ptr == '?'))))
                              {
                                /* The iconv() call failed.
                                   U+FFFD can't be converted to TO_CODESET.
                                   Use '?' instead.  */
                                scratchbuf[0] = '?';
                                scratchlen = 1;
                                inptr = scratchbuf;
                                insize = scratchlen;
                                res = iconv (cd2,
                                             (ICONV_CONST char **) &inptr, &insize,
                                             &out2ptr, &out2size);
                              }
                            else
                              {
                                /* Accept the results of the iconv() call.  */
                                out2ptr = out2ptr_try;
                                out2size = out2size_try;
                                res = 0;
                              }
                          }
                        else
                          {
                            /* TO_CODESET is UTF-8.  */
                            if (out2size >= insize)
                              {
                                memcpy (out2ptr, inptr, insize);
                                out2ptr += insize;
                                out2size -= insize;
                                inptr += insize;
                                insize = 0;
                                res = 0;
                              }
                            else
                              {
                                errno = E2BIG;
                                res = (size_t)(-1);
                              }
                          }
                        length = out2ptr - result;
                        if (res == (size_t)(-1) && errno == E2BIG)
                          {
                            allocated = 2 * allocated;
                            if (length + 1 + extra_alloc > allocated)
                              abort ();
                            char *memory;
                            if (result == initial_result)
                              memory = (char *) malloc (allocated);
                            else
                              memory = (char *) realloc (result, allocated);
                            if (memory == NULL)
                              {
                                if (result != initial_result)
                                  free (result);
                                errno = ENOMEM;
                                return -1;
                              }
                            if (result == initial_result)
                              memcpy (memory, initial_result, length);
                            result = memory;
                            grow = false;

                            out2ptr = result + length;
                            out2size = allocated - extra_alloc - length;
                            if (cd2 != (iconv_t)(-1))
                              res = iconv (cd2,
                                           (ICONV_CONST char **) &inptr,
                                           &insize,
                                           &out2ptr, &out2size);
                            else
                              {
                                /* TO_CODESET is UTF-8.  */
                                if (!(out2size >= insize))
                                  abort ();
                                memcpy (out2ptr, inptr, insize);
                                out2ptr += insize;
                                out2size -= insize;
                                inptr += insize;
                                insize = 0;
                                res = 0;
                              }
                            length = out2ptr - result;
                          }
# if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
     && !(defined __GLIBC__ && !defined __UCLIBC__)
                        /* FreeBSD iconv(), NetBSD iconv(), and Solaris 11
                           iconv() insert a '?' if they cannot convert.
                           musl libc iconv() inserts a '*' if it cannot convert.
                           Only GNU libiconv (excluding the bastard Apple iconv)
                           and GNU libc are known to prefer to fail rather than
                           doing a lossy conversion.  */
                        if (res != (size_t)(-1) && res > 0)
                          {
                            errno = EILSEQ;
                            res = (size_t)(-1);
                          }
# endif
                        if (res == (size_t)(-1))
                          {
                            /* Failure converting the ASCII replacement.  */
                            if (result != initial_result)
                              free (result);
                            return -1;
                          }
                      }
                    else
                      {
                        if (result != initial_result)
                          free (result);
                        return -1;
                      }
                  }
                if (!(in2size > 0
                      || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
                  break;
                if (grow)
                  {
                    allocated = 2 * allocated;
                    char *memory;
                    if (result == initial_result)
                      memory = (char *) malloc (allocated);
                    else
                      memory = (char *) realloc (result, allocated);
                    if (memory == NULL)
                      {
                        if (result != initial_result)
                          free (result);
                        errno = ENOMEM;
                        return -1;
                      }
                    if (result == initial_result)
                      memcpy (memory, initial_result, length);
                    result = memory;
                  }
              }

            /* Move the remaining bytes to the beginning of utf8buf.  */
            if (in2size > 0)
              memmove (utf8buf, in2ptr, in2size);
            utf8len = in2size;
          }

        if (res1 == (size_t)(-1))
          {
            if (errno1 == EINVAL)
              in1size = 0;
            else if (errno1 == EILSEQ)
              {
                if (result != initial_result)
                  free (result);
                errno = errno1;
                return -1;
              }
          }
      }
# undef utf8bufsize
  }

 done:
  /* Now the final memory allocation.  */
  if (result == tmpbuf)
    {
      size_t memsize = length + extra_alloc;

      if (*resultp != NULL && *lengthp >= memsize)
        result = *resultp;
      else
        {
          char *memory = (char *) malloc (memsize > 0 ? memsize : 1);
          if (memory != NULL)
            result = memory;
          else
            {
              errno = ENOMEM;
              return -1;
            }
        }
      memcpy (result, tmpbuf, length);
    }
  else if (result != *resultp && length + extra_alloc < allocated)
    {
      /* Shrink the allocated memory if possible.  */
      size_t memsize = length + extra_alloc;
      char *memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
      if (memory != NULL)
        result = memory;
    }
  *resultp = result;
  *lengthp = length;
  return 0;
# undef tmpbuf
# undef tmpbufsize
}

/* Convert the SRCLEN bytes starting at SRC, using the conversion
   descriptors bundled in *CD.
   This is a thin front end: the three descriptors CD->cd, CD->cd1 and
   CD->cd2 are unpacked and handed, together with HANDLER, OFFSETS, RESULTP
   and LENGTHP, to mem_cd_iconveh_internal.  The constant 0 is presumably
   that routine's "extra allocation" amount, i.e. no spare bytes are
   requested after the converted data.
   The return value is whatever the internal routine returns: 0 when
   *RESULTP and *LENGTHP have been filled in, -1 with errno set otherwise.  */
int
mem_cd_iconveh (const char *src, size_t srclen,
                const iconveh_t *cd,
                enum iconv_ilseq_handler handler,
                size_t *offsets,
                char **resultp, size_t *lengthp)
{
  int status = mem_cd_iconveh_internal (src, srclen,
                                        cd->cd, cd->cd1, cd->cd2,
                                        handler, 0,
                                        offsets, resultp, lengthp);

  return status;
}

/* Convert the NUL-terminated string SRC, using the conversion descriptors
   bundled in *CD, and return the converted string, NUL-terminated.
   Returns NULL if the underlying conversion reports failure; errno is then
   the one left behind by that conversion.  */
char *
str_cd_iconveh (const char *src,
                const iconveh_t *cd,
                enum iconv_ilseq_handler handler)
{
  /* A trailing NUL byte in the input usually comes out as a trailing NUL
     byte in the output - but UTF-7 is an exception.  To stay usable with
     UTF-7, only the strlen (src) bytes before the terminator are converted,
     and the terminator is appended manually below.  The argument 1 asks the
     internal routine for one byte of room beyond the converted data
     (presumably its "extra allocation" amount), which is where that
     terminator goes.  */
  char *converted = NULL;
  size_t converted_len = 0;

  if (mem_cd_iconveh_internal (src, strlen (src),
                               cd->cd, cd->cd1, cd->cd2, handler, 1,
                               NULL, &converted, &converted_len)
      < 0)
    {
      free (converted);
      return NULL;
    }

  /* Terminate the output by hand.  */
  converted[converted_len] = '\0';
  return converted;
}

#endif

/* Convert the SRCLEN bytes starting at SRC from FROM_CODESET to TO_CODESET.
   On entry, *RESULTP / *LENGTHP may describe a caller-provided buffer (or
   *RESULTP may be NULL); on success they describe the converted data.
   Three cases are distinguished:
     - SRCLEN == 0: only *LENGTHP is set (to 0); *RESULTP is left alone.
     - OFFSETS == NULL and both codeset names are equal, ignoring case: the
       input is copied verbatim, into the caller's buffer if it is large
       enough, otherwise into freshly malloc'ed memory.
     - otherwise: a temporary conversion descriptor is opened, used through
       mem_cd_iconveh, and closed again.
   Returns 0 on success, -1 with errno set on failure.  Without iconv
   support (HAVE_ICONV unset) the third case fails with errno = ENOSYS.  */
int
mem_iconveh (const char *src, size_t srclen,
             const char *from_codeset, const char *to_codeset,
             enum iconv_ilseq_handler handler,
             size_t *offsets,
             char **resultp, size_t *lengthp)
{
  /* Empty input: there is nothing to convert.  */
  if (srclen == 0)
    {
      *lengthp = 0;
      return 0;
    }

  /* Identical encodings and no offsets wanted: a plain copy suffices.  */
  if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
    {
      char *copy = *resultp;

      /* Reuse the caller's buffer only if it exists and is big enough.  */
      if (copy == NULL || *lengthp < srclen)
        {
          copy = (char *) malloc (srclen);
          if (copy == NULL)
            {
              errno = ENOMEM;
              return -1;
            }
        }
      memcpy (copy, src, srclen);
      *resultp = copy;
      *lengthp = srclen;
      return 0;
    }

#if HAVE_ICONV
  iconveh_t cd;
  if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
    return -1;

  /* Work on local copies, so that the caller's *RESULTP and *LENGTHP are
     only overwritten once everything, including the close, succeeded.  */
  char *converted = *resultp;
  size_t converted_len = *lengthp;
  int status = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
                               &converted, &converted_len);

  if (status < 0)
    {
      /* Release cd, keeping the errno that mem_cd_iconveh left behind.  */
      int conversion_errno = errno;
      iconveh_close (&cd);
      errno = conversion_errno;
      return status;
    }

  if (iconveh_close (&cd) < 0)
    {
      /* Only free memory that the conversion allocated itself, never the
         buffer that belongs to the caller.  */
      if (converted != *resultp)
        free (converted);
      return -1;
    }

  *resultp = converted;
  *lengthp = converted_len;
  return status;
#else
  /* Deliberately not the error code that a failing iconv_open would
     produce for unsupported codesets: this lets the caller emit a
     message such as
       "iconv() is not supported. Installing GNU libiconv and
        then reinstalling this package would fix this."  */
  errno = ENOSYS;
  return -1;
#endif
}

/* Convert the NUL-terminated string SRC from FROM_CODESET to TO_CODESET and
   return the result as a newly allocated NUL-terminated string.
   If SRC is empty, or if both codeset names are equal ignoring case, the
   result is simply a strdup'ed copy of SRC.  Otherwise a temporary
   conversion descriptor is opened, used through str_cd_iconveh, and closed
   again.
   Returns NULL with errno set on failure.  Without iconv support
   (HAVE_ICONV unset) a real conversion fails with errno = ENOSYS.  */
char *
str_iconveh (const char *src,
             const char *from_codeset, const char *to_codeset,
             enum iconv_ilseq_handler handler)
{
  /* Trivial cases: duplicating the input is all that is needed.  */
  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
    {
      char *copy = strdup (src);

      if (copy == NULL)
        errno = ENOMEM;
      return copy;
    }

#if HAVE_ICONV
  iconveh_t cd;
  if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
    return NULL;

  char *converted = str_cd_iconveh (src, &cd, handler);

  if (converted == NULL)
    {
      /* Release cd, keeping the errno that str_cd_iconveh left behind.  */
      int conversion_errno = errno;
      iconveh_close (&cd);
      errno = conversion_errno;
      return NULL;
    }

  if (iconveh_close (&cd) < 0)
    {
      /* The close failed: discard the converted string and report the
         failure.  */
      free (converted);
      return NULL;
    }

  return converted;
#else
  /* Deliberately not the error code that a failing iconv_open would
     produce for unsupported codesets: this lets the caller emit a
     message such as
       "iconv() is not supported. Installing GNU libiconv and
        then reinstalling this package would fix this."  */
  errno = ENOSYS;
  return NULL;
#endif
}

Coverage Report

Created: 2026-02-05 06:23