/src/logging-log4cxx/src/main/cpp/charsetdecoder.cpp

Source
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#define NOMINMAX /* tell windows not to define min/max macros */
#include <log4cxx/private/string_c11.h>
#include <log4cxx/logstring.h>
#include <log4cxx/helpers/charsetdecoder.h>
#include <log4cxx/helpers/bytebuffer.h>
#include <log4cxx/helpers/exception.h>
#include <log4cxx/helpers/pool.h>
#include <log4cxx/helpers/loglog.h>
#include <apr_xlate.h>
#if !defined(LOG4CXX)
  #define LOG4CXX 1
#endif
#include <log4cxx/private/log4cxx_private.h>
#include <locale.h>
#include <apr_portable.h>
#include <log4cxx/helpers/stringhelper.h>
#include <log4cxx/helpers/transcoder.h>
#include <mutex>

using namespace LOG4CXX_NS;
using namespace LOG4CXX_NS::helpers;

// NOTE(review): this macro is defined outside this file; presumably it emits the
// log4cxx class-registration support code for CharsetDecoder - confirm against its definition.
IMPLEMENT_LOG4CXX_OBJECT(CharsetDecoder)


namespace LOG4CXX_NS
{
namespace helpers
{

#if APR_HAS_XLATE
/**
 *  Converts from an arbitrary encoding to LogString
 *    using apr_xlate.  Requires real iconv implementation,
*    apr-iconv will crash in use.
 */
class APRCharsetDecoder : public CharsetDecoder
{
  public:
    /**
     *  Creates a new instance.
     *  @param frompage name of source encoding.
     */
    APRCharsetDecoder(const LogString& frompage) : pool()
    {
#if LOG4CXX_LOGCHAR_IS_WCHAR
      const char* topage = "WCHAR_T";
#endif
#if LOG4CXX_LOGCHAR_IS_UTF8
      const char* topage = "UTF-8";
#endif
#if LOG4CXX_LOGCHAR_IS_UNICHAR
      const char* topage = "UTF-16";
#endif
      std::string fpage(Transcoder::encodeCharsetName(frompage));
      apr_status_t stat = apr_xlate_open(&convset,
          topage,
          fpage.c_str(),
          pool.getAPRPool());

      if (stat != APR_SUCCESS)
      {
        throw IllegalArgumentException(frompage);
      }
    }

    /**
     *  Destructor.
     */
    virtual ~APRCharsetDecoder()
    {
    }

    virtual log4cxx_status_t decode(ByteBuffer& in,
      LogString& out)
    {
      enum { BUFSIZE = 256 };
      logchar buf[BUFSIZE];
      const apr_size_t initial_outbytes_left = BUFSIZE * sizeof(logchar);
      apr_status_t stat = APR_SUCCESS;

      if (in.remaining() == 0)
      {
        size_t outbytes_left = initial_outbytes_left;
        {
          std::lock_guard<std::mutex> lock(mutex);
          stat = apr_xlate_conv_buffer((apr_xlate_t*) convset,
              NULL, NULL, (char*) buf, &outbytes_left);
        }
        out.append(buf, (initial_outbytes_left - outbytes_left) / sizeof(logchar));
      }
      else
      {
        while (in.remaining() > 0 && stat == APR_SUCCESS)
        {
          size_t inbytes_left = in.remaining();
          size_t initial_inbytes_left = inbytes_left;
          apr_size_t outbytes_left = initial_outbytes_left;
          {
            std::lock_guard<std::mutex> lock(mutex);
            stat = apr_xlate_conv_buffer((apr_xlate_t*) convset,
                in.current(),
                &inbytes_left,
                (char*) buf,
                &outbytes_left);
          }
          out.append(buf, (initial_outbytes_left - outbytes_left) / sizeof(logchar));
          if (inbytes_left == initial_inbytes_left && stat == APR_SUCCESS)
          {
            stat = APR_BADCH;
            break;
          }
          in.increment_position(initial_inbytes_left - inbytes_left);
        }
      }

      return stat;
    }

  private:
    APRCharsetDecoder(const APRCharsetDecoder&);
    APRCharsetDecoder& operator=(const APRCharsetDecoder&);
    LOG4CXX_NS::helpers::Pool pool;
    std::mutex mutex;
    apr_xlate_t* convset;
};

#endif

#if LOG4CXX_LOGCHAR_IS_WCHAR && LOG4CXX_HAS_MBSRTOWCS
/**
*    Converts from the default multi-byte string to
*        LogString using mbsrtowcs.
*
*    Only compiled when logchar is wchar_t and mbsrtowcs is available.
*/
class MbstowcsCharsetDecoder : public CharsetDecoder
{
  public:
    MbstowcsCharsetDecoder()
    {
    }

    virtual ~MbstowcsCharsetDecoder()
    {
    }

  private:
    /**
     *  Appends a null-terminated wide string to out.
     *  @return always APR_SUCCESS.
     */
    inline log4cxx_status_t append(LogString& out, const wchar_t* buf)
    {
      out.append(buf);
      return APR_SUCCESS;
    }

    /**
     *  Decodes the unread bytes of in, working through a bounded,
     *  null-terminated copy of the input one chunk at a time.
     *  @param in  source bytes; its position is advanced past the consumed bytes.
     *  @param out receives the converted wide characters.
     *  @return APR_SUCCESS, or APR_BADCH when mbsrtowcs reports an illegal
     *          sequence or a chunk makes no progress.
     */
    virtual log4cxx_status_t decode(ByteBuffer& in,
      LogString& out)
    {
      log4cxx_status_t stat = APR_SUCCESS;
      enum { BUFSIZE = 256 };
      wchar_t wbuf[BUFSIZE];
      char cbuf[BUFSIZE*4];

      // Conversion state starts zeroed on every call and is carried across the chunks below.
      mbstate_t mbstate;
      memset(&mbstate, 0, sizeof(mbstate));

      while (in.remaining() > 0)
      {
        const char* src = in.current();

        if (*src == 0)
        {
          // A NUL at the current position is passed through as a NUL logchar.
          out.append(1, (logchar) 0);
          in.increment_position(1);
        }
        else
        {
          // Copy at most sizeof(cbuf) - 1 bytes so a terminating sentinel always fits.
          auto available = std::min(sizeof (cbuf) - 1, in.remaining());
          strncpy(cbuf, src, available);
          cbuf[available] = 0;
          src = cbuf;
          // At most BUFSIZE - 1 wide characters are requested, leaving room for the
          // terminator written into wbuf further down.
          size_t wCharCount = mbsrtowcs(wbuf,
              &src,
              BUFSIZE - 1,
              &mbstate);
          // mbsrtowcs sets *src to nullptr when it consumes a null wide character.
          // Performing pointer arithmetic on that nullptr (src - cbuf) is undefined
          // behaviour, so recover the consumed byte count from the position of the
          // null that stopped the conversion instead.
          size_t converted;
          if (src == nullptr)
          {
            size_t nullPos = 0;
            while (nullPos < available && cbuf[nullPos] != 0)
            {
              ++nullPos;
            }
            // If the null came from the input bytes, it was consumed too;
            // if it is the sentinel we wrote at cbuf[available], stop at available.
            // NOTE(review): in the first case the NUL is skipped in the input but is
            // not appended to out (the append below stops at the terminator), unlike
            // the leading-NUL branch above - confirm this asymmetry is intended.
            converted = (nullPos < available) ? nullPos + 1 : available;
          }
          else
          {
            converted = static_cast<size_t>(src - cbuf);
          }
          in.increment_position(converted);

          if (wCharCount == (size_t) -1) // Illegal byte sequence?
          {
            // NOTE(review): the loop is left before anything from this chunk is
            // appended, so characters converted ahead of the bad sequence are not
            // added to out.
            LogString msg(LOG4CXX_STR("Illegal byte sequence at "));
            msg.append(std::to_wstring(in.position()));
            msg.append(LOG4CXX_STR(" of "));
            msg.append(std::to_wstring(in.limit()));
            LogLog::warn(msg);
            stat = APR_BADCH;
            break;
          }
          else
          {
            // Guard against an endless loop: when the call did not fail but no bytes
            // were consumed while input remains (e.g. an incomplete multibyte
            // character at the end of the data), stop and report a bad character.
            if (converted == 0 && in.remaining() > 0)
            {
              LogString msg(LOG4CXX_STR("Incomplete multibyte sequence at end of buffer"));
              LogLog::warn(msg);
              stat = APR_BADCH;
              break; // Break the infinite loop
            }

            // Terminate the converted text so it can be appended as a C wide string.
            wbuf[wCharCount] = 0;
            stat = append(out, wbuf);
          }
        }
      }

      return stat;
    }



  private:
    // Copying is disabled: declared but never defined.
    MbstowcsCharsetDecoder(const MbstowcsCharsetDecoder&);
    MbstowcsCharsetDecoder& operator=(const MbstowcsCharsetDecoder&);
};
#endif


/**
*    Pass-through decoder for the case where the external charset
*    already matches the internal logchar representation, so the
*    bytes are appended without any conversion.
*/
class TrivialCharsetDecoder : public CharsetDecoder
{
  public:
    TrivialCharsetDecoder()
    {
    }

    virtual ~TrivialCharsetDecoder()
    {
    }

    /**
     *  Appends the unread bytes of @p in to @p out, reinterpreted as logchar.
     *  @param in  source bytes; every unread byte is marked as consumed.
     *  @param out receives remaining / sizeof(logchar) characters.
     *  @return always APR_SUCCESS.
     */
    virtual log4cxx_status_t decode(ByteBuffer& in,
      LogString& out)
    {
      const size_t byteCount = in.remaining();

      if (byteCount == 0)
      {
        return APR_SUCCESS;
      }

      const logchar* chars = reinterpret_cast<const logchar*>(in.current());
      out.append(chars, byteCount / sizeof(logchar));
      // All bytes are consumed, including any tail smaller than one logchar.
      in.increment_position(byteCount);

      return APR_SUCCESS;
    }

  private:
    // Copying is disabled: declared but never defined.
    TrivialCharsetDecoder(const TrivialCharsetDecoder&);
    TrivialCharsetDecoder& operator=(const TrivialCharsetDecoder&);
};

/**
*    Decodes UTF-8 input into LogString, one code point at a time.
*/
class UTF8CharsetDecoder : public CharsetDecoder
{
  public:
    UTF8CharsetDecoder()
    {
    }

    virtual ~UTF8CharsetDecoder()
    {
    }

  private:
    /**
     *  Pulls code points from @p in with getUTF8CodePoint and appends
     *  each one to @p out through Transcoder::encode.
     *  @return APR_SUCCESS once the buffer is exhausted, APR_BADCH as soon
     *          as a code point comes back as 0xFFFF or no bytes were consumed.
     */
    virtual log4cxx_status_t decode(ByteBuffer& in,
      LogString& out)
    {
      for (auto bytesBefore = in.remaining(); bytesBefore != 0; )
      {
        const auto codePoint = getUTF8CodePoint(in);
        const auto bytesAfter = in.remaining();
        const bool madeProgress = (bytesAfter != bytesBefore);

        if (codePoint == 0xFFFF || !madeProgress)
        {
          return APR_BADCH;
        }

        Transcoder::encode(codePoint, out);
        bytesBefore = bytesAfter;
      }
      return APR_SUCCESS;
    }

  private:
    // Copying is disabled: declared but never defined.
    UTF8CharsetDecoder(const UTF8CharsetDecoder&);
    UTF8CharsetDecoder& operator=(const UTF8CharsetDecoder&);
};

/**
*    Decodes ISO-8859-1 input into LogString; each byte value is used
*    directly as the code point.
*/
class ISOLatinCharsetDecoder : public CharsetDecoder
{
  public:
    ISOLatinCharsetDecoder()
    {
    }

    virtual ~ISOLatinCharsetDecoder()
    {
    }

  private:
    /**
     *  Encodes every unread byte of @p in, taken as an unsigned value,
     *  into @p out and marks all of them as consumed.
     *  @return always APR_SUCCESS.
     */
    virtual log4cxx_status_t decode(ByteBuffer& in,
      LogString& out)
    {
      const auto total = in.remaining();
      const auto bytes = in.current();

      for (size_t index = 0; index < total; ++index)
      {
        // Go through unsigned char first so bytes above 0x7F stay in 0x80..0xFF.
        const auto byteValue = static_cast<unsigned char>(bytes[index]);
        Transcoder::encode(static_cast<unsigned int>(byteValue), out);
      }
      in.increment_position(total);

      return APR_SUCCESS;
    }

  private:
    // Copying is disabled: declared but never defined.
    ISOLatinCharsetDecoder(const ISOLatinCharsetDecoder&);
    ISOLatinCharsetDecoder& operator=(const ISOLatinCharsetDecoder&);
};


/**
*    Decodes US-ASCII input into LogString and stops at the first
*    byte that is not below 0x80.
*/
class USASCIICharsetDecoder : public CharsetDecoder
{
  public:
    USASCIICharsetDecoder()
    {
    }

    virtual ~USASCIICharsetDecoder()
    {
    }

  private:

    /**
     *  Encodes the leading run of bytes below 0x80 from @p in into @p out.
     *  Only the accepted bytes are marked as consumed, so on failure the
     *  buffer is left positioned at the offending byte.
     *  @return APR_SUCCESS when every byte was accepted, otherwise APR_BADCH.
     */
    virtual log4cxx_status_t decode(ByteBuffer& in,
      LogString& out)
    {
      const auto limit = in.remaining();
      const auto bytes = in.current();
      size_t accepted = 0;
      log4cxx_status_t status = APR_SUCCESS;

      while (accepted < limit)
      {
        // Direct char -> unsigned int conversion: any byte outside 0x00..0x7F
        // ends up at or above 0x80 whether plain char is signed or not.
        const auto value = static_cast<unsigned int>(bytes[accepted]);

        if (0x80 <= value)
        {
          status = APR_BADCH;
          break;
        }

        Transcoder::encode(value, out);
        ++accepted;
      }
      in.increment_position(accepted);

      return status;
    }

  private:
    // Copying is disabled: declared but never defined.
    USASCIICharsetDecoder(const USASCIICharsetDecoder&);
    USASCIICharsetDecoder& operator=(const USASCIICharsetDecoder&);
};

/**
 *    Charset decoder that uses current locale settings.
 *
 *    Conversion goes through std::mbrtowc with a conversion state kept
 *    as a member, so the state is carried over from one decode() call
 *    to the next.
 */
class LocaleCharsetDecoder : public CharsetDecoder
{
  public:
    LocaleCharsetDecoder() : state()
    {
    }
    /**
     *  Decodes the unread bytes of in and appends the characters to out.
     *  @param in  source bytes; its position is advanced by the number of
     *             bytes counted as decoded.
     *  @param out receives the decoded characters.
     *  @return APR_SUCCESS, or APR_BADCH when mbrtowc reports a decoding error.
     */
    log4cxx_status_t decode(ByteBuffer& in, LogString& out) override
    {
      log4cxx_status_t result = APR_SUCCESS;
      auto p = in.current();
      auto availableByteCount = in.remaining();
      size_t byteCount = 0;
#if !LOG4CXX_CHARSET_EBCDIC
      // Fast path, skipped on EBCDIC builds and while a multibyte character is pending.
      if (std::mbsinit(&this->state)) // ByteBuffer not partially decoded?
      {
        // Copy single byte characters
        for (; byteCount < availableByteCount && static_cast<unsigned int>(*p) < 0x80; ++byteCount, ++p)
        {
          out.append(1, *p);
        }
      }
#endif
      // Decode characters that may be represented by multiple bytes
      while (byteCount < availableByteCount)
      {
        wchar_t ch = 0;
        size_t n = std::mbrtowc(&ch, p, availableByteCount - byteCount, &this->state);
        if (0 == n) // NULL encountered?
        {
          // The NUL byte is counted as consumed but nothing is appended for it,
          // and decoding stops here; later bytes stay unread in the buffer.
          ++byteCount;
          break;
        }
        if (static_cast<std::size_t>(-1) == n) // decoding error?
        {
          result = APR_BADCH;
          break;
        }
        if (static_cast<std::size_t>(-2) == n) // incomplete sequence?
        {
          // NOTE(review): the trailing bytes are not counted as consumed here, yet
          // mbrtowc is documented to have absorbed them into the state - verify
          // that callers do not present the same bytes again on the next call.
          break;
        }
        Transcoder::encode(static_cast<unsigned int>(ch), out);
        byteCount += n;
        p += n;
      }
      in.increment_position(byteCount);
      return result;
    }

  private:
    // Multibyte conversion state, value-initialised by the constructor.
    std::mbstate_t state;
};



} // namespace helpers

}  //namespace log4cxx


// Base-class constructor: there is nothing to initialise.
CharsetDecoder::CharsetDecoder() = default;


// Base-class destructor: there is nothing to release.
CharsetDecoder::~CharsetDecoder() = default;

/**
 *  Builds the decoder matching the default charset chosen at compile time.
 *  The first matching configuration below wins.
 *  @return a decoder allocated with new, returned as a raw pointer.
 */
CharsetDecoder* CharsetDecoder::createDefaultDecoder()
{
#if LOG4CXX_CHARSET_UTF8 && LOG4CXX_LOGCHAR_IS_UTF8
  // External and internal representations are both UTF-8: no conversion needed.
  return new TrivialCharsetDecoder();
#elif LOG4CXX_CHARSET_UTF8
  return new UTF8CharsetDecoder();
#elif LOG4CXX_CHARSET_ISO88591 || defined(_WIN32_WCE)
  return new ISOLatinCharsetDecoder();
#elif LOG4CXX_CHARSET_USASCII
  return new USASCIICharsetDecoder();
#elif LOG4CXX_LOGCHAR_IS_WCHAR && LOG4CXX_HAS_MBSRTOWCS
  return new MbstowcsCharsetDecoder();
#else
  return new LocaleCharsetDecoder();
#endif
}

/**
 *  Returns the shared default decoder, created on first use.
 *  @return the cached decoder, or a newly created one when the cached
 *          pointer is empty.
 */
CharsetDecoderPtr CharsetDecoder::getDefaultDecoder()
{
  static WideLife<CharsetDecoderPtr> decoder(createDefaultDecoder());

  if (decoder.value() != 0)
  {
    return decoder;
  }

  //
  //  An empty cached pointer means this was invoked after static
  //  variable destruction (logging called from the destructor of a
  //  static object), so hand out a new decoder instead.
  //
  return CharsetDecoderPtr( createDefaultDecoder() );
}

/**
 *  Creates a new UTF-8 decoder on every call.
 */
CharsetDecoderPtr CharsetDecoder::getUTF8Decoder()
{
  CharsetDecoderPtr utf8Decoder = std::make_shared<UTF8CharsetDecoder>();
  return utf8Decoder;
}

/**
 *  Creates a new ISO-8859-1 decoder on every call.
 */
CharsetDecoderPtr CharsetDecoder::getISOLatinDecoder()
{
  CharsetDecoderPtr latinDecoder = std::make_shared<ISOLatinCharsetDecoder>();
  return latinDecoder;
}


/**
 *  Creates a decoder for the named charset.
 *
 *  Names for UTF-8, US-ASCII, ISO-8859-1 and the pseudo-charset "LOCALE"
 *  are served by the built-in decoders. Any other name is passed to the
 *  APR based decoder when APR_HAS_XLATE is enabled.
 *
 *  @param charset name of the source encoding.
 *  @return a new decoder instance for that encoding.
 *  @throws IllegalArgumentException when the name is not one of the
 *          built-in aliases and APR translation is not compiled in
 *          (the APR decoder's constructor also throws it when the
 *          encoding cannot be opened).
 */
CharsetDecoderPtr CharsetDecoder::getDecoder(const LogString& charset)
{
  // Each alias is given as an upper-case and a lower-case spelling; the
  // lower-case one has to be entirely lower-case for an all-lower-case
  // request to match.
  if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-8"), LOG4CXX_STR("utf-8")) ||
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF8"), LOG4CXX_STR("utf8")) ||
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP65001"), LOG4CXX_STR("cp65001")))
  {
#if LOG4CXX_LOGCHAR_IS_UTF8
    // logchar is already UTF-8, so the bytes can be taken as they are.
    return std::make_shared<TrivialCharsetDecoder>();
#else
    return std::make_shared<UTF8CharsetDecoder>();
#endif
  }
  else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("C"), LOG4CXX_STR("c")) ||
    charset == LOG4CXX_STR("646") ||
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("US-ASCII"), LOG4CXX_STR("us-ascii")) ||
    // The lower-case alternative used to read "iso646-US", which made an
    // all-lower-case "iso646-us" fall through to the generic path.
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO646-US"), LOG4CXX_STR("iso646-us")) ||
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ANSI_X3.4-1968"), LOG4CXX_STR("ansi_x3.4-1968")) ||
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP20127"), LOG4CXX_STR("cp20127")))
  {
    return std::make_shared<USASCIICharsetDecoder>();
  }
  else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-8859-1"), LOG4CXX_STR("iso-8859-1")) ||
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-LATIN-1"), LOG4CXX_STR("iso-latin-1")) ||
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP1252"), LOG4CXX_STR("cp1252")))
  {
    return std::make_shared<ISOLatinCharsetDecoder>();
  }
  else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("LOCALE"), LOG4CXX_STR("locale")))
  {
    return std::make_shared<LocaleCharsetDecoder>();
  }

  // Not a built-in alias: delegate to APR when available, otherwise reject.
#if APR_HAS_XLATE
  return std::make_shared<APRCharsetDecoder>(charset);
#else
  throw IllegalArgumentException(charset);
#endif
}

/**
 *  Convenience overload that decodes a C string.
 *  @param in           bytes to decode; only the part before the first NUL is used.
 *  @param maxByteCount upper bound on the number of bytes examined.
 *  @param out          receives the decoded characters.
 *  @return the status of the ByteBuffer based decode().
 */
log4cxx_status_t CharsetDecoder::decode(const char* in, size_t maxByteCount, LogString& out)
{
  const size_t byteCount = strnlen_s(in, maxByteCount);
  // ByteBuffer takes a non-const pointer, hence the cast.
  ByteBuffer source(const_cast<char*>(in), byteCount);
  return decode(source, out);
}

/**
 *  Reads one UTF-8 encoded code point from the current position of @p in.
 *
 *  On success the buffer position is advanced past the sequence and the
 *  code point is returned. On any failure 0xFFFF is returned and the
 *  position is left untouched. Failures are: an empty buffer, a stray
 *  continuation byte, a truncated sequence, a missing continuation byte,
 *  an overlong form, a UTF-16 surrogate, a value above U+10FFFF, or a
 *  lead byte outside F0..F7 for a four byte sequence.
 *
 *  @param in source bytes.
 *  @return the decoded code point, or 0xFFFF.
 */
unsigned int CharsetDecoder::getUTF8CodePoint(ByteBuffer& in)
{
  const unsigned int invalid = 0xFFFF;

  const auto byteCount = in.remaining();
  if (byteCount == 0)
    return invalid;

  auto bytes = in.current();
  const auto lead = static_cast<unsigned char>(bytes[0]);

  // Single byte (ASCII) form.
  if (lead <= 0x7F)
  {
    in.increment_position(1);
    return lead;
  }

  // A sequence must not start with a continuation byte and needs a second byte.
  if ((lead & 0xC0) == 0x80 || byteCount < 2)
    return invalid;

  const auto second = static_cast<unsigned char>(bytes[1]);
  if ((second & 0xC0) != 0x80) // not a continuation?
    return invalid;

  // Two byte form: 110xxxxx 10xxxxxx
  if ((lead & 0xE0) == 0xC0)
  {
    const unsigned int codePoint = ((lead & 0x1F) << 6) + (second & 0x3F);
    if (codePoint < 0x80) // overlong encoding
      return invalid;

    in.increment_position(2);
    return codePoint;
  }

  if (byteCount < 3)
    return invalid;

  const auto third = static_cast<unsigned char>(bytes[2]);
  if ((third & 0xC0) != 0x80) // not a continuation?
    return invalid;

  // Three byte form: 1110xxxx 10xxxxxx 10xxxxxx
  if ((lead & 0xF0) == 0xE0)
  {
    const unsigned int codePoint = ((lead & 0x0F) << 12)
      + ((second & 0x3F) << 6)
      + (third & 0x3F);

    // Reject overlong forms and, following RFC 3629 section 3, encodings
    // of the UTF-16 surrogate halves U+D800..U+DFFF.
    if (codePoint < 0x800 || (0xD800 <= codePoint && codePoint <= 0xDFFF))
      return invalid;

    in.increment_position(3);
    return codePoint;
  }

  if (byteCount < 4)
    return invalid;

  const auto fourth = static_cast<unsigned char>(bytes[3]);
  if ((fourth & 0xC0) != 0x80) // not a continuation?
    return invalid;

  // Four byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  const unsigned int codePoint = ((lead & 0x07) << 18)
    + ((second & 0x3F) << 12)
    + ((third & 0x3F) << 6)
    + (fourth & 0x3F);

  // The lead byte test is required because the 0x07 mask drops the high bits
  // of F8..FF, which are never valid lead bytes. The range test rejects
  // overlong forms and anything above the U+10FFFF limit of RFC 3629.
  if ((lead & 0xF8) != 0xF0 || codePoint <= 0xFFFF || codePoint > 0x10FFFF)
    return invalid;

  in.increment_position(4);
  return codePoint;
}

Coverage Report

Created: 2026-06-15 06:22

Line	Count	Source
1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17		#define NOMINMAX /* tell windows not to define min/max macros */
18		#include <log4cxx/private/string_c11.h>
19		#include <log4cxx/logstring.h>
20		#include <log4cxx/helpers/charsetdecoder.h>
21		#include <log4cxx/helpers/bytebuffer.h>
22		#include <log4cxx/helpers/exception.h>
23		#include <log4cxx/helpers/pool.h>
24		#include <log4cxx/helpers/loglog.h>
25		#include <apr_xlate.h>
26		#if !defined(LOG4CXX)
27		#define LOG4CXX 1
28		#endif
29		#include <log4cxx/private/log4cxx_private.h>
30		#include <locale.h>
31		#include <apr_portable.h>
32		#include <log4cxx/helpers/stringhelper.h>
33		#include <log4cxx/helpers/transcoder.h>
34		#include <mutex>
35
36		using namespace LOG4CXX_NS;
37		using namespace LOG4CXX_NS::helpers;
38
39		IMPLEMENT_LOG4CXX_OBJECT(CharsetDecoder)
40
41
42		namespace LOG4CXX_NS
43		{
44		namespace helpers
45		{
46
47		#if APR_HAS_XLATE
48		/**
49		* Converts from an arbitrary encoding to LogString
50		* using apr_xlate. Requires real iconv implementation,
51		* apr-iconv will crash in use.
52		*/
53		class APRCharsetDecoder : public CharsetDecoder
54		{
55		public:
56		/**
57		* Creates a new instance.
58		* @param frompage name of source encoding.
59		*/
60	0	APRCharsetDecoder(const LogString& frompage) : pool()
61	0	{
62	0	#if LOG4CXX_LOGCHAR_IS_WCHAR
63	0	const char* topage = "WCHAR_T";
64	0	#endif
65		#if LOG4CXX_LOGCHAR_IS_UTF8
66		const char* topage = "UTF-8";
67		#endif
68		#if LOG4CXX_LOGCHAR_IS_UNICHAR
69		const char* topage = "UTF-16";
70		#endif
71	0	std::string fpage(Transcoder::encodeCharsetName(frompage));
72	0	apr_status_t stat = apr_xlate_open(&convset,
73	0	topage,
74	0	fpage.c_str(),
75	0	pool.getAPRPool());
76
77	0	if (stat != APR_SUCCESS)
78	0	{
79	0	throw IllegalArgumentException(frompage);
80	0	}
81	0	}
82
83		/**
84		* Destructor.
85		*/
86		virtual ~APRCharsetDecoder()
87	0	{
88	0	}
89
90		virtual log4cxx_status_t decode(ByteBuffer& in,
91		LogString& out)
92	0	{
93	0	enum { BUFSIZE = 256 };
94	0	logchar buf[BUFSIZE];
95	0	const apr_size_t initial_outbytes_left = BUFSIZE * sizeof(logchar);
96	0	apr_status_t stat = APR_SUCCESS;
97
98	0	if (in.remaining() == 0)
99	0	{
100	0	size_t outbytes_left = initial_outbytes_left;
101	0	{
102	0	std::lock_guard<std::mutex> lock(mutex);
103	0	stat = apr_xlate_conv_buffer((apr_xlate_t*) convset,
104	0	NULL, NULL, (char*) buf, &outbytes_left);
105	0	}
106	0	out.append(buf, (initial_outbytes_left - outbytes_left) / sizeof(logchar));
107	0	}
108	0	else
109	0	{
110	0	while (in.remaining() > 0 && stat == APR_SUCCESS)
111	0	{
112	0	size_t inbytes_left = in.remaining();
113	0	size_t initial_inbytes_left = inbytes_left;
114	0	apr_size_t outbytes_left = initial_outbytes_left;
115	0	{
116	0	std::lock_guard<std::mutex> lock(mutex);
117	0	stat = apr_xlate_conv_buffer((apr_xlate_t*) convset,
118	0	in.current(),
119	0	&inbytes_left,
120	0	(char*) buf,
121	0	&outbytes_left);
122	0	}
123	0	out.append(buf, (initial_outbytes_left - outbytes_left) / sizeof(logchar));
124	0	if (inbytes_left == initial_inbytes_left && stat == APR_SUCCESS)
125	0	{
126	0	stat = APR_BADCH;
127	0	break;
128	0	}
129	0	in.increment_position(initial_inbytes_left - inbytes_left);
130	0	}
131	0	}
132
133	0	return stat;
134	0	}
135
136		private:
137		APRCharsetDecoder(const APRCharsetDecoder&);
138		APRCharsetDecoder& operator=(const APRCharsetDecoder&);
139		LOG4CXX_NS::helpers::Pool pool;
140		std::mutex mutex;
141		apr_xlate_t* convset;
142		};
143
144		#endif
145
146		#if LOG4CXX_LOGCHAR_IS_WCHAR && LOG4CXX_HAS_MBSRTOWCS
147		/**
148		* Converts from the default multi-byte string to
149		* LogString using mbstowcs.
150		*
151		*/
152		class MbstowcsCharsetDecoder : public CharsetDecoder
153		{
154		public:
155		MbstowcsCharsetDecoder()
156	0	{
157	0	}
158
159		virtual ~MbstowcsCharsetDecoder()
160	0	{
161	0	}
162
163		private:
164		inline log4cxx_status_t append(LogString& out, const wchar_t* buf)
165	0	{
166	0	out.append(buf);
167	0	return APR_SUCCESS;
168	0	}
169
170		virtual log4cxx_status_t decode(ByteBuffer& in,
171		LogString& out)
172	0	{
173	0	log4cxx_status_t stat = APR_SUCCESS;
174	0	enum { BUFSIZE = 256 };
175	0	wchar_t wbuf[BUFSIZE];
176	0	char cbuf[BUFSIZE*4];
177	0
178	0	mbstate_t mbstate;
179	0	memset(&mbstate, 0, sizeof(mbstate));
180	0
181	0	while (in.remaining() > 0)
182	0	{
183	0	const char* src = in.current();
184	0
185	0	if (*src == 0)
186	0	{
187	0	out.append(1, (logchar) 0);
188	0	in.increment_position(1);
189	0	}
190	0	else
191	0	{
192	0	auto available = std::min(sizeof (cbuf) - 1, in.remaining());
193	0	strncpy(cbuf, src, available);
194	0	cbuf[available] = 0;
195	0	src = cbuf;
196	0	size_t wCharCount = mbsrtowcs(wbuf,
197	0	&src,
198	0	BUFSIZE - 1,
199	0	&mbstate);
200	0	// mbsrtowcs sets *src to nullptr when it consumes a null wide character.
201	0	// Performing pointer arithmetic on that nullptr (src - cbuf) is undefined
202	0	// behaviour, so recover the consumed byte count from the position of the
203	0	// null that stopped the conversion instead.
204	0	size_t converted;
205	0	if (src == nullptr)
206	0	{
207	0	size_t nullPos = 0;
208	0	while (nullPos < available && cbuf[nullPos] != 0)
209	0	{
210	0	++nullPos;
211	0	}
212	0	// If the null came from the input bytes, it was consumed too;
213	0	// if it is the sentinel we wrote at cbuf[available], stop at available.
214	0	converted = (nullPos < available) ? nullPos + 1 : available;
215	0	}
216	0	else
217	0	{
218	0	converted = static_cast<size_t>(src - cbuf);
219	0	}
220	0	in.increment_position(converted);
221	0
222	0	if (wCharCount == (size_t) -1) // Illegal byte sequence?
223	0	{
224	0	LogString msg(LOG4CXX_STR("Illegal byte sequence at "));
225	0	msg.append(std::to_wstring(in.position()));
226	0	msg.append(LOG4CXX_STR(" of "));
227	0	msg.append(std::to_wstring(in.limit()));
228	0	LogLog::warn(msg);
229	0	stat = APR_BADCH;
230	0	break;
231	0	}
232	0	else
233	0	{
234	0	// FIX: Check for incomplete sequence infinite loop.
235	0	// If mbsrtowcs returns success (>=0) but converted 0 bytes while data remains,
236	0	// we are stuck (e.g. incomplete multibyte char at EOF).
237	0	if (converted == 0 && in.remaining() > 0)
238	0	{
239	0	LogString msg(LOG4CXX_STR("Incomplete multibyte sequence at end of buffer"));
240	0	LogLog::warn(msg);
241	0	stat = APR_BADCH;
242	0	break; // Break the infinite loop
243	0	}
244	0
245	0	wbuf[wCharCount] = 0;
246	0	stat = append(out, wbuf);
247	0	}
248	0	}
249	0	}
250	0
251	0	return stat;
252	0	}
253
254
255
256		private:
257		MbstowcsCharsetDecoder(const MbstowcsCharsetDecoder&);
258		MbstowcsCharsetDecoder& operator=(const MbstowcsCharsetDecoder&);
259		};
260		#endif
261
262
263		/**
264		* Decoder used when the external and internal charsets
265		* are the same.
266		*
267		*/
268		class TrivialCharsetDecoder : public CharsetDecoder
269		{
270		public:
271		TrivialCharsetDecoder()
272	0	{
273	0	}
274
275		virtual ~TrivialCharsetDecoder()
276	0	{
277	0	}
278
279		virtual log4cxx_status_t decode(ByteBuffer& in,
280		LogString& out)
281	0	{
282	0	size_t remaining = in.remaining();
283	0
284	0	if ( remaining > 0)
285	0	{
286	0	auto src = in.current();
287	0	auto count = remaining / sizeof(logchar);
288	0	out.append(reinterpret_cast<const logchar*>(src), count);
289	0	in.increment_position(remaining);
290	0	}
291	0
292	0	return APR_SUCCESS;
293	0	}
294
295
296
297		private:
298		TrivialCharsetDecoder(const TrivialCharsetDecoder&);
299		TrivialCharsetDecoder& operator=(const TrivialCharsetDecoder&);
300		};
301
302		/**
303		* Converts from UTF-8 to LogString
304		*
305		*/
306		class UTF8CharsetDecoder : public CharsetDecoder
307		{
308		public:
309		UTF8CharsetDecoder()
310	1	{
311	1	}
312
313		virtual ~UTF8CharsetDecoder()
314	0	{
315	0	}
316
317		private:
318		virtual log4cxx_status_t decode(ByteBuffer& in,
319		LogString& out)
320	22.4M	{
321	22.4M	auto availableByteCount = in.remaining();
322	72.7M	while (0 < availableByteCount)
323	72.7M	{
324	72.7M	auto sv = getUTF8CodePoint(in);
325	72.7M	auto nextAvailableByteCount = in.remaining();
326	72.7M	if (sv == 0xFFFF \|\| nextAvailableByteCount == availableByteCount)
327	22.4M	return APR_BADCH;
328	50.3M	Transcoder::encode(sv, out);
329	50.3M	availableByteCount = nextAvailableByteCount;
330	50.3M	}
331	1.95k	return APR_SUCCESS;
332	22.4M	}
333
334		private:
335		UTF8CharsetDecoder(const UTF8CharsetDecoder&);
336		UTF8CharsetDecoder& operator=(const UTF8CharsetDecoder&);
337		};
338
339		/**
340		* Converts from ISO-8859-1 to LogString.
341		*
342		*/
343		class ISOLatinCharsetDecoder : public CharsetDecoder
344		{
345		public:
346		ISOLatinCharsetDecoder()
347	0	{
348	0	}
349
350		virtual ~ISOLatinCharsetDecoder()
351	0	{
352	0	}
353
354		private:
355		virtual log4cxx_status_t decode(ByteBuffer& in,
356		LogString& out)
357	0	{
358	0	auto availableByteCount = in.remaining();
359	0	auto src = in.current();
360	0	auto srcEnd = src + availableByteCount;
361
362	0	while (src < srcEnd)
363	0	{
364	0	auto sv = static_cast<unsigned int>(static_cast<unsigned char>(*src++));
365	0	Transcoder::encode(sv, out);
366	0	}
367	0	in.increment_position(availableByteCount);
368
369	0	return APR_SUCCESS;
370	0	}
371
372
373
374		private:
375		ISOLatinCharsetDecoder(const ISOLatinCharsetDecoder&);
376		ISOLatinCharsetDecoder& operator=(const ISOLatinCharsetDecoder&);
377		};
378
379
380		/**
381		* Converts from US-ASCII to LogString.
382		*
383		*/
384		class USASCIICharsetDecoder : public CharsetDecoder
385		{
386		public:
387		USASCIICharsetDecoder()
388	0	{
389	0	}
390
391		virtual ~USASCIICharsetDecoder()
392	0	{
393	0	}
394
395		private:
396
397		virtual log4cxx_status_t decode(ByteBuffer& in,
398		LogString& out)
399	0	{
400	0	log4cxx_status_t stat = APR_SUCCESS;
401
402	0	auto availableByteCount = in.remaining();
403	0	auto src = in.current();
404	0	auto srcEnd = src + availableByteCount;
405	0	size_t byteCount = 0;
406	0	while (src < srcEnd)
407	0	{
408	0	auto sv = static_cast<unsigned int>(*src++);
409
410	0	if (sv < 0x80)
411	0	{
412	0	++byteCount;
413	0	Transcoder::encode(sv, out);
414	0	}
415	0	else
416	0	{
417	0	stat = APR_BADCH;
418	0	break;
419	0	}
420	0	}
421	0	in.increment_position(byteCount);
422
423	0	return stat;
424	0	}
425
426
427
428		private:
429		USASCIICharsetDecoder(const USASCIICharsetDecoder&);
430		USASCIICharsetDecoder& operator=(const USASCIICharsetDecoder&);
431		};
432
433		/**
434		* Charset decoder that uses current locale settings.
435		*/
436		class LocaleCharsetDecoder : public CharsetDecoder
437		{
438		public:
439	0	LocaleCharsetDecoder() : state()
440	0	{
441	0	}
442		log4cxx_status_t decode(ByteBuffer& in, LogString& out) override
443	0	{
444	0	log4cxx_status_t result = APR_SUCCESS;
445	0	auto p = in.current();
446	0	auto availableByteCount = in.remaining();
447	0	size_t byteCount = 0;
448	0	#if !LOG4CXX_CHARSET_EBCDIC
449	0	if (std::mbsinit(&this->state)) // ByteBuffer not partially decoded?
450	0	{
451		// Copy single byte characters
452	0	for (; byteCount < availableByteCount && static_cast<unsigned int>(*p) < 0x80; ++byteCount, ++p)
453	0	{
454	0	out.append(1, *p);
455	0	}
456	0	}
457	0	#endif
458		// Decode characters that may be represented by multiple bytes
459	0	while (byteCount < availableByteCount)
460	0	{
461	0	wchar_t ch = 0;
462	0	size_t n = std::mbrtowc(&ch, p, availableByteCount - byteCount, &this->state);
463	0	if (0 == n) // NULL encountered?
464	0	{
465	0	++byteCount;
466	0	break;
467	0	}
468	0	if (static_cast<std::size_t>(-1) == n) // decoding error?
469	0	{
470	0	result = APR_BADCH;
471	0	break;
472	0	}
473	0	if (static_cast<std::size_t>(-2) == n) // incomplete sequence?
474	0	{
475	0	break;
476	0	}
477	0	Transcoder::encode(static_cast<unsigned int>(ch), out);
478	0	byteCount += n;
479	0	p += n;
480	0	}
481	0	in.increment_position(byteCount);
482	0	return result;
483	0	}
484
485		private:
486		std::mbstate_t state;
487		};
488
489
490
491		} // namespace helpers
492
493		} //namespace log4cxx
494
495
496		CharsetDecoder::CharsetDecoder()
497	1	{
498	1	}
499
500
501		CharsetDecoder::~CharsetDecoder()
502	1	{
503	1	}
504
505		CharsetDecoder* CharsetDecoder::createDefaultDecoder()
506	1	{
507	1	#if LOG4CXX_CHARSET_UTF8
508		#if LOG4CXX_LOGCHAR_IS_UTF8
509		return new TrivialCharsetDecoder();
510		#else
511	1	return new UTF8CharsetDecoder();
512	1	#endif
513		#elif LOG4CXX_CHARSET_ISO88591 \|\| defined(_WIN32_WCE)
514		return new ISOLatinCharsetDecoder();
515		#elif LOG4CXX_CHARSET_USASCII
516		return new USASCIICharsetDecoder();
517		#elif LOG4CXX_LOGCHAR_IS_WCHAR && LOG4CXX_HAS_MBSRTOWCS
518		return new MbstowcsCharsetDecoder();
519		#else
520		return new LocaleCharsetDecoder();
521		#endif
522	1	}
523
524		CharsetDecoderPtr CharsetDecoder::getDefaultDecoder()
525	1	{
526	1	static WideLife<CharsetDecoderPtr> decoder(createDefaultDecoder());
527
528		//
529		// if invoked after static variable destruction
530		// (if logging is called in the destructor of a static object)
531		// then create a new decoder.
532		//
533	1	if (decoder.value() == 0)
534	0	{
535	0	return CharsetDecoderPtr( createDefaultDecoder() );
536	0	}
537
538	1	return decoder;
539	1	}
540
541		CharsetDecoderPtr CharsetDecoder::getUTF8Decoder()
542	0	{
543	0	return std::make_shared<UTF8CharsetDecoder>();
544	0	}
545
546		CharsetDecoderPtr CharsetDecoder::getISOLatinDecoder()
547	0	{
548	0	return std::make_shared<ISOLatinCharsetDecoder>();
549	0	}
550
551
552		CharsetDecoderPtr CharsetDecoder::getDecoder(const LogString& charset)
553	0	{
554	0	if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-8"), LOG4CXX_STR("utf-8")) \|\|
555	0	StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF8"), LOG4CXX_STR("utf8")) \|\|
556	0	StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP65001"), LOG4CXX_STR("cp65001")))
557	0	{
558		#if LOG4CXX_LOGCHAR_IS_UTF8
559		return std::make_shared<TrivialCharsetDecoder>();
560		#else
561	0	return std::make_shared<UTF8CharsetDecoder>();
562	0	#endif
563	0	}
564	0	else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("C"), LOG4CXX_STR("c")) \|\|
565	0	charset == LOG4CXX_STR("646") \|\|
566	0	StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("US-ASCII"), LOG4CXX_STR("us-ascii")) \|\|
567	0	StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO646-US"), LOG4CXX_STR("iso646-US")) \|\|
568	0	StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ANSI_X3.4-1968"), LOG4CXX_STR("ansi_x3.4-1968")) \|\|
569	0	StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP20127"), LOG4CXX_STR("cp20127")))
570	0	{
571	0	return std::make_shared<USASCIICharsetDecoder>();
572	0	}
573	0	else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-8859-1"), LOG4CXX_STR("iso-8859-1")) \|\|
574	0	StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-LATIN-1"), LOG4CXX_STR("iso-latin-1")) \|\|
575	0	StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP1252"), LOG4CXX_STR("cp1252")))
576	0	{
577	0	return std::make_shared<ISOLatinCharsetDecoder>();
578	0	}
579	0	else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("LOCALE"), LOG4CXX_STR("locale")))
580	0	{
581	0	return std::make_shared<LocaleCharsetDecoder>();
582	0	}
583
584	0	#if APR_HAS_XLATE
585	0	return std::make_shared<APRCharsetDecoder>(charset);
586		#else
587		throw IllegalArgumentException(charset);
588		#endif
589	0	}
590
591		log4cxx_status_t CharsetDecoder::decode(const char* in, size_t maxByteCount, LogString& out)
592	0	{
593	0	ByteBuffer buf((char*)in, strnlen_s(in, maxByteCount));
594	0	return decode(buf, out);
595	0	}
596
597		unsigned int CharsetDecoder::getUTF8CodePoint(ByteBuffer& in)
598	72.7M	{
599	72.7M	auto availableByteCount = in.remaining();
600	72.7M	if (0 == availableByteCount)
601	0	return 0xFFFF;
602
603	72.7M	auto pChar = in.current();
604	72.7M	auto ch1 = static_cast<unsigned char>(*pChar);
605	72.7M	if (ch1 <= 0x7F)
606	50.3M	{
607	50.3M	in.increment_position(1);
608	50.3M	return ch1;
609	50.3M	}
610
611		//
612		// should not have continuation character here
613		//
614	22.4M	if ((ch1 & 0xC0) != 0x80 && 1 < availableByteCount)
615	5.62M	{
616	5.62M	auto ch2 = static_cast<unsigned char>(*(pChar + 1));
617	5.62M	if ((ch2 & 0xC0) != 0x80) // not a continuation?
618	4.83M	return 0xFFFF;
619
620	788k	if ((ch1 & 0xE0) == 0xC0)
621	3.38k	{
622	3.38k	unsigned int rv = ((ch1 & 0x1F) << 6) + (ch2 & 0x3F);
623	3.38k	if (rv >= 0x80)
624	2.45k	{
625	2.45k	in.increment_position(2);
626	2.45k	return rv;
627	2.45k	}
628	933	return 0xFFFF;
629	3.38k	}
630
631	785k	if (2 < availableByteCount)
632	785k	{
633	785k	auto ch3 = static_cast<unsigned char>(*(pChar + 2));
634	785k	if ((ch3 & 0xC0) != 0x80) // not a continuation?
635	323k	return 0xFFFF;
636
637	461k	if ((ch1 & 0xF0) == 0xE0)
638	130k	{
639	130k	unsigned int rv = ((ch1 & 0x0F) << 12)
640	130k	+ ((ch2 & 0x3F) << 6)
641	130k	+ (ch3 & 0x3F);
642
643		// RFC 3629 §3 prohibits UTF-8 encodings of the UTF-16 surrogate
644		// halves (U+D800..U+DFFF); accepting them lets malformed Unicode
645		// cross the decode boundary into LogString and downstream output.
646	130k	if (rv < 0x800 \|\| (0xD800 <= rv && rv <= 0xDFFF))
647	123k	return 0xFFFF;
648
649	7.23k	in.increment_position(3);
650	7.23k	return rv;
651	130k	}
652
653	331k	if (3 < availableByteCount)
654	330k	{
655	330k	auto ch4 = static_cast<unsigned char>(*(pChar + 3));
656	330k	if ((ch4 & 0xC0) != 0x80) // not a continuation?
657	322k	return 0xFFFF;
658
659	8.91k	unsigned int rv = ((ch1 & 0x07) << 18)
660	8.91k	+ ((ch2 & 0x3F) << 12)
661	8.91k	+ ((ch3 & 0x3F) << 6)
662	8.91k	+ (ch4 & 0x3F);
663
664		// RFC 3629 §3 caps UTF-8 at U+10FFFF; lead bytes F5..F7 (and
665		// F4 with an over-high trailer) produce rv > 0x10FFFF, which
666		// is not a Unicode code point. Without this bound, encodeUTF16
667		// later silently aliases the bogus value to a valid in-range
668		// code point — a substitution-collision filter-bypass primitive.
669		// Lead bytes F8..FF are never valid UTF-8, but the & 0x07 mask
670		// discards their high bits, so without the (ch1 & 0xF8) == 0xF0
671		// guard F8 BF BF BF would alias to U+3FFFF instead of being
672		// rejected.
673	8.91k	if ((ch1 & 0xF8) == 0xF0 && rv > 0xFFFF && rv <= 0x10FFFF)
674	2.72k	{
675	2.72k	in.increment_position(4);
676	2.72k	return rv;
677	2.72k	}
678
679	8.91k	}
680	331k	}
681	785k	}
682	16.8M	return 0xFFFF;
683	22.4M	}