Coverage Report

Created: 2026-06-15 06:22

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/logging-log4cxx/src/main/cpp/charsetdecoder.cpp
Line
Count
Source
1
/*
2
 * Licensed to the Apache Software Foundation (ASF) under one or more
3
 * contributor license agreements.  See the NOTICE file distributed with
4
 * this work for additional information regarding copyright ownership.
5
 * The ASF licenses this file to You under the Apache License, Version 2.0
6
 * (the "License"); you may not use this file except in compliance with
7
 * the License.  You may obtain a copy of the License at
8
 *
9
 *      http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 */
17
#define NOMINMAX /* tell windows not to define min/max macros */
18
#include <log4cxx/private/string_c11.h>
19
#include <log4cxx/logstring.h>
20
#include <log4cxx/helpers/charsetdecoder.h>
21
#include <log4cxx/helpers/bytebuffer.h>
22
#include <log4cxx/helpers/exception.h>
23
#include <log4cxx/helpers/pool.h>
24
#include <log4cxx/helpers/loglog.h>
25
#include <apr_xlate.h>
26
#if !defined(LOG4CXX)
27
  #define LOG4CXX 1
28
#endif
29
#include <log4cxx/private/log4cxx_private.h>
30
#include <locale.h>
31
#include <apr_portable.h>
32
#include <log4cxx/helpers/stringhelper.h>
33
#include <log4cxx/helpers/transcoder.h>
34
#include <mutex>
35
36
using namespace LOG4CXX_NS;
37
using namespace LOG4CXX_NS::helpers;
38
39
IMPLEMENT_LOG4CXX_OBJECT(CharsetDecoder)
40
41
42
namespace LOG4CXX_NS
43
{
44
namespace helpers
45
{
46
47
#if APR_HAS_XLATE
48
/**
49
 *  Converts from an arbitrary encoding to LogString
50
 *    using apr_xlate.  Requires real iconv implementation,
51
*    apr-iconv will crash in use.
52
 */
53
class APRCharsetDecoder : public CharsetDecoder
54
{
55
  public:
56
    /**
57
     *  Creates a new instance.
58
     *  @param frompage name of source encoding.
59
     */
60
0
    APRCharsetDecoder(const LogString& frompage) : pool()
61
0
    {
62
0
#if LOG4CXX_LOGCHAR_IS_WCHAR
63
0
      const char* topage = "WCHAR_T";
64
0
#endif
65
#if LOG4CXX_LOGCHAR_IS_UTF8
66
      const char* topage = "UTF-8";
67
#endif
68
#if LOG4CXX_LOGCHAR_IS_UNICHAR
69
      const char* topage = "UTF-16";
70
#endif
71
0
      std::string fpage(Transcoder::encodeCharsetName(frompage));
72
0
      apr_status_t stat = apr_xlate_open(&convset,
73
0
          topage,
74
0
          fpage.c_str(),
75
0
          pool.getAPRPool());
76
77
0
      if (stat != APR_SUCCESS)
78
0
      {
79
0
        throw IllegalArgumentException(frompage);
80
0
      }
81
0
    }
82
83
    /**
84
     *  Destructor.
85
     */
86
    virtual ~APRCharsetDecoder()
87
0
    {
88
0
    }
89
90
    virtual log4cxx_status_t decode(ByteBuffer& in,
91
      LogString& out)
92
0
    {
93
0
      enum { BUFSIZE = 256 };
94
0
      logchar buf[BUFSIZE];
95
0
      const apr_size_t initial_outbytes_left = BUFSIZE * sizeof(logchar);
96
0
      apr_status_t stat = APR_SUCCESS;
97
98
0
      if (in.remaining() == 0)
99
0
      {
100
0
        size_t outbytes_left = initial_outbytes_left;
101
0
        {
102
0
          std::lock_guard<std::mutex> lock(mutex);
103
0
          stat = apr_xlate_conv_buffer((apr_xlate_t*) convset,
104
0
              NULL, NULL, (char*) buf, &outbytes_left);
105
0
        }
106
0
        out.append(buf, (initial_outbytes_left - outbytes_left) / sizeof(logchar));
107
0
      }
108
0
      else
109
0
      {
110
0
        while (in.remaining() > 0 && stat == APR_SUCCESS)
111
0
        {
112
0
          size_t inbytes_left = in.remaining();
113
0
          size_t initial_inbytes_left = inbytes_left;
114
0
          apr_size_t outbytes_left = initial_outbytes_left;
115
0
          {
116
0
            std::lock_guard<std::mutex> lock(mutex);
117
0
            stat = apr_xlate_conv_buffer((apr_xlate_t*) convset,
118
0
                in.current(),
119
0
                &inbytes_left,
120
0
                (char*) buf,
121
0
                &outbytes_left);
122
0
          }
123
0
          out.append(buf, (initial_outbytes_left - outbytes_left) / sizeof(logchar));
124
0
          if (inbytes_left == initial_inbytes_left && stat == APR_SUCCESS)
125
0
          {
126
0
            stat = APR_BADCH;
127
0
            break;
128
0
          }
129
0
          in.increment_position(initial_inbytes_left - inbytes_left);
130
0
        }
131
0
      }
132
133
0
      return stat;
134
0
    }
135
136
  private:
137
    APRCharsetDecoder(const APRCharsetDecoder&);
138
    APRCharsetDecoder& operator=(const APRCharsetDecoder&);
139
    LOG4CXX_NS::helpers::Pool pool;
140
    std::mutex mutex;
141
    apr_xlate_t* convset;
142
};
143
144
#endif
145
146
#if LOG4CXX_LOGCHAR_IS_WCHAR && LOG4CXX_HAS_MBSRTOWCS
147
/**
148
*    Converts from the default multi-byte string to
149
*        LogString using mbstowcs.
150
*
151
*/
152
class MbstowcsCharsetDecoder : public CharsetDecoder
153
{
154
  public:
155
    MbstowcsCharsetDecoder()
156
0
    {
157
0
    }
158
159
    virtual ~MbstowcsCharsetDecoder()
160
0
    {
161
0
    }
162
163
  private:
164
    inline log4cxx_status_t append(LogString& out, const wchar_t* buf)
165
0
    {
166
0
      out.append(buf);
167
0
      return APR_SUCCESS;
168
0
    }
169
170
    virtual log4cxx_status_t decode(ByteBuffer& in,
171
      LogString& out)
172
0
    {
173
0
      log4cxx_status_t stat = APR_SUCCESS;
174
0
      enum { BUFSIZE = 256 };
175
0
      wchar_t wbuf[BUFSIZE];
176
0
      char cbuf[BUFSIZE*4];
177
0
178
0
      mbstate_t mbstate;
179
0
      memset(&mbstate, 0, sizeof(mbstate));
180
0
181
0
      while (in.remaining() > 0)
182
0
      {
183
0
        const char* src = in.current();
184
0
185
0
        if (*src == 0)
186
0
        {
187
0
          out.append(1, (logchar) 0);
188
0
          in.increment_position(1);
189
0
        }
190
0
        else
191
0
        {
192
0
          auto available = std::min(sizeof (cbuf) - 1, in.remaining());
193
0
          strncpy(cbuf, src, available);
194
0
          cbuf[available] = 0;
195
0
          src = cbuf;
196
0
          size_t wCharCount = mbsrtowcs(wbuf,
197
0
              &src,
198
0
              BUFSIZE - 1,
199
0
              &mbstate);
200
0
          // mbsrtowcs sets *src to nullptr when it consumes a null wide character.
201
0
          // Performing pointer arithmetic on that nullptr (src - cbuf) is undefined
202
0
          // behaviour, so recover the consumed byte count from the position of the
203
0
          // null that stopped the conversion instead.
204
0
          size_t converted;
205
0
          if (src == nullptr)
206
0
          {
207
0
            size_t nullPos = 0;
208
0
            while (nullPos < available && cbuf[nullPos] != 0)
209
0
            {
210
0
              ++nullPos;
211
0
            }
212
0
            // If the null came from the input bytes, it was consumed too;
213
0
            // if it is the sentinel we wrote at cbuf[available], stop at available.
214
0
            converted = (nullPos < available) ? nullPos + 1 : available;
215
0
          }
216
0
          else
217
0
          {
218
0
            converted = static_cast<size_t>(src - cbuf);
219
0
          }
220
0
          in.increment_position(converted);
221
0
222
0
          if (wCharCount == (size_t) -1) // Illegal byte sequence?
223
0
          {
224
0
            LogString msg(LOG4CXX_STR("Illegal byte sequence at "));
225
0
            msg.append(std::to_wstring(in.position()));
226
0
            msg.append(LOG4CXX_STR(" of "));
227
0
            msg.append(std::to_wstring(in.limit()));
228
0
            LogLog::warn(msg);
229
0
            stat = APR_BADCH;
230
0
            break;
231
0
          }
232
0
          else
233
0
          {
234
0
            // FIX: Check for incomplete sequence infinite loop.
235
0
            // If mbsrtowcs returns success (>=0) but converted 0 bytes while data remains,
236
0
            // we are stuck (e.g. incomplete multibyte char at EOF).
237
0
            if (converted == 0 && in.remaining() > 0)
238
0
            {
239
0
              LogString msg(LOG4CXX_STR("Incomplete multibyte sequence at end of buffer"));
240
0
              LogLog::warn(msg);
241
0
              stat = APR_BADCH;
242
0
              break; // Break the infinite loop
243
0
            }
244
0
245
0
            wbuf[wCharCount] = 0;
246
0
            stat = append(out, wbuf);
247
0
          }
248
0
        }
249
0
      }
250
0
251
0
      return stat;
252
0
    }
253
254
255
256
  private:
257
    MbstowcsCharsetDecoder(const MbstowcsCharsetDecoder&);
258
    MbstowcsCharsetDecoder& operator=(const MbstowcsCharsetDecoder&);
259
};
260
#endif
261
262
263
/**
264
*    Decoder used when the external and internal charsets
265
*    are the same.
266
*
267
*/
268
class TrivialCharsetDecoder : public CharsetDecoder
269
{
270
  public:
271
    TrivialCharsetDecoder()
272
0
    {
273
0
    }
274
275
    virtual ~TrivialCharsetDecoder()
276
0
    {
277
0
    }
278
279
    virtual log4cxx_status_t decode(ByteBuffer& in,
280
      LogString& out)
281
0
    {
282
0
      size_t remaining = in.remaining();
283
0
284
0
      if ( remaining > 0)
285
0
      {
286
0
        auto src = in.current();
287
0
        auto count = remaining / sizeof(logchar);
288
0
        out.append(reinterpret_cast<const logchar*>(src), count);
289
0
        in.increment_position(remaining);
290
0
      }
291
0
292
0
      return APR_SUCCESS;
293
0
    }
294
295
296
297
  private:
298
    TrivialCharsetDecoder(const TrivialCharsetDecoder&);
299
    TrivialCharsetDecoder& operator=(const TrivialCharsetDecoder&);
300
};
301
302
/**
303
*    Converts from UTF-8 to LogString
304
*
305
*/
306
class UTF8CharsetDecoder : public CharsetDecoder
307
{
308
  public:
309
    UTF8CharsetDecoder()
310
1
    {
311
1
    }
312
313
    virtual ~UTF8CharsetDecoder()
314
0
    {
315
0
    }
316
317
  private:
318
    virtual log4cxx_status_t decode(ByteBuffer& in,
319
      LogString& out)
320
22.4M
    {
321
22.4M
      auto availableByteCount = in.remaining();
322
72.7M
      while (0 < availableByteCount)
323
72.7M
      {
324
72.7M
        auto sv = getUTF8CodePoint(in);
325
72.7M
        auto nextAvailableByteCount = in.remaining();
326
72.7M
        if (sv == 0xFFFF || nextAvailableByteCount == availableByteCount)
327
22.4M
          return APR_BADCH;
328
50.3M
        Transcoder::encode(sv, out);
329
50.3M
        availableByteCount = nextAvailableByteCount;
330
50.3M
      }
331
1.95k
      return APR_SUCCESS;
332
22.4M
    }
333
334
  private:
335
    UTF8CharsetDecoder(const UTF8CharsetDecoder&);
336
    UTF8CharsetDecoder& operator=(const UTF8CharsetDecoder&);
337
};
338
339
/**
340
*    Converts from ISO-8859-1 to LogString.
341
*
342
*/
343
class ISOLatinCharsetDecoder : public CharsetDecoder
344
{
345
  public:
346
    ISOLatinCharsetDecoder()
347
0
    {
348
0
    }
349
350
    virtual ~ISOLatinCharsetDecoder()
351
0
    {
352
0
    }
353
354
  private:
355
    virtual log4cxx_status_t decode(ByteBuffer& in,
356
      LogString& out)
357
0
    {
358
0
      auto availableByteCount = in.remaining();
359
0
      auto src = in.current();
360
0
      auto srcEnd = src + availableByteCount;
361
362
0
      while (src < srcEnd)
363
0
      {
364
0
        auto sv = static_cast<unsigned int>(static_cast<unsigned char>(*src++));
365
0
        Transcoder::encode(sv, out);
366
0
      }
367
0
      in.increment_position(availableByteCount);
368
369
0
      return APR_SUCCESS;
370
0
    }
371
372
373
374
  private:
375
    ISOLatinCharsetDecoder(const ISOLatinCharsetDecoder&);
376
    ISOLatinCharsetDecoder& operator=(const ISOLatinCharsetDecoder&);
377
};
378
379
380
/**
381
*    Converts from US-ASCII to LogString.
382
*
383
*/
384
class USASCIICharsetDecoder : public CharsetDecoder
385
{
386
  public:
387
    USASCIICharsetDecoder()
388
0
    {
389
0
    }
390
391
    virtual ~USASCIICharsetDecoder()
392
0
    {
393
0
    }
394
395
  private:
396
397
    virtual log4cxx_status_t decode(ByteBuffer& in,
398
      LogString& out)
399
0
    {
400
0
      log4cxx_status_t stat = APR_SUCCESS;
401
402
0
      auto availableByteCount = in.remaining();
403
0
      auto src = in.current();
404
0
      auto srcEnd = src + availableByteCount;
405
0
      size_t byteCount = 0;
406
0
      while (src < srcEnd)
407
0
      {
408
0
        auto sv = static_cast<unsigned int>(*src++);
409
410
0
        if (sv < 0x80)
411
0
        {
412
0
          ++byteCount;
413
0
          Transcoder::encode(sv, out);
414
0
        }
415
0
        else
416
0
        {
417
0
          stat = APR_BADCH;
418
0
          break;
419
0
        }
420
0
      }
421
0
      in.increment_position(byteCount);
422
423
0
      return stat;
424
0
    }
425
426
427
428
  private:
429
    USASCIICharsetDecoder(const USASCIICharsetDecoder&);
430
    USASCIICharsetDecoder& operator=(const USASCIICharsetDecoder&);
431
};
432
433
/**
434
 *    Charset decoder that uses current locale settings.
435
 */
436
class LocaleCharsetDecoder : public CharsetDecoder
437
{
438
  public:
439
0
    LocaleCharsetDecoder() : state()
440
0
    {
441
0
    }
442
    log4cxx_status_t decode(ByteBuffer& in, LogString& out) override
443
0
    {
444
0
      log4cxx_status_t result = APR_SUCCESS;
445
0
      auto p = in.current();
446
0
      auto availableByteCount = in.remaining();
447
0
      size_t byteCount = 0;
448
0
#if !LOG4CXX_CHARSET_EBCDIC
449
0
      if (std::mbsinit(&this->state)) // ByteBuffer not partially decoded?
450
0
      {
451
        // Copy single byte characters
452
0
        for (; byteCount < availableByteCount && static_cast<unsigned int>(*p) < 0x80; ++byteCount, ++p)
453
0
        {
454
0
          out.append(1, *p);
455
0
        }
456
0
      }
457
0
#endif
458
      // Decode characters that may be represented by multiple bytes
459
0
      while (byteCount < availableByteCount)
460
0
      {
461
0
        wchar_t ch = 0;
462
0
        size_t n = std::mbrtowc(&ch, p, availableByteCount - byteCount, &this->state);
463
0
        if (0 == n) // NULL encountered?
464
0
        {
465
0
          ++byteCount;
466
0
          break;
467
0
        }
468
0
        if (static_cast<std::size_t>(-1) == n) // decoding error?
469
0
        {
470
0
          result = APR_BADCH;
471
0
          break;
472
0
        }
473
0
        if (static_cast<std::size_t>(-2) == n) // incomplete sequence?
474
0
        {
475
0
          break;
476
0
        }
477
0
        Transcoder::encode(static_cast<unsigned int>(ch), out);
478
0
        byteCount += n;
479
0
        p += n;
480
0
      }
481
0
      in.increment_position(byteCount);
482
0
      return result;
483
0
    }
484
485
  private:
486
    std::mbstate_t state;
487
};
488
489
490
491
} // namespace helpers
492
493
}  //namespace log4cxx
494
495
496
/** Base class constructor; there is nothing to initialize. */
CharsetDecoder::CharsetDecoder() = default;
499
500
501
/** Base class destructor; there is nothing to release. */
CharsetDecoder::~CharsetDecoder() = default;
504
505
/**
 *  Allocates the decoder matching the build-time default charset.
 *  The choice is made entirely by preprocessor configuration.
 *  @return a newly allocated decoder owned by the caller.
 */
CharsetDecoder* CharsetDecoder::createDefaultDecoder()
{
#if LOG4CXX_CHARSET_UTF8 && LOG4CXX_LOGCHAR_IS_UTF8
  // External and internal encodings are both UTF-8: no conversion needed.
  return new TrivialCharsetDecoder();
#elif LOG4CXX_CHARSET_UTF8
  return new UTF8CharsetDecoder();
#elif LOG4CXX_CHARSET_ISO88591 || defined(_WIN32_WCE)
  return new ISOLatinCharsetDecoder();
#elif LOG4CXX_CHARSET_USASCII
  return new USASCIICharsetDecoder();
#elif LOG4CXX_LOGCHAR_IS_WCHAR && LOG4CXX_HAS_MBSRTOWCS
  return new MbstowcsCharsetDecoder();
#else
  return new LocaleCharsetDecoder();
#endif
}
523
524
/**
 *  Provides the shared default decoder, created on first use.
 *  @return the cached decoder, or a freshly created one when the
 *  cached pointer is empty.
 */
CharsetDecoderPtr CharsetDecoder::getDefaultDecoder()
{
  static WideLife<CharsetDecoderPtr> decoder(createDefaultDecoder());

  if (decoder.value() != 0)
  {
    return decoder;
  }

  //
  //  Reached if invoked after static variable destruction
  //     (if logging is called in the destructor of a static object):
  //     hand out a new decoder instead of the empty cached one.
  //
  return CharsetDecoderPtr( createDefaultDecoder() );
}
540
541
/**
 *  Creates a decoder for UTF-8 input.
 *  @return a new UTF8CharsetDecoder instance on every call.
 */
CharsetDecoderPtr CharsetDecoder::getUTF8Decoder()
{
  CharsetDecoderPtr utf8Decoder = std::make_shared<UTF8CharsetDecoder>();
  return utf8Decoder;
}
545
546
/**
 *  Creates a decoder for ISO-8859-1 input.
 *  @return a new ISOLatinCharsetDecoder instance on every call.
 */
CharsetDecoderPtr CharsetDecoder::getISOLatinDecoder()
{
  CharsetDecoderPtr latinDecoder = std::make_shared<ISOLatinCharsetDecoder>();
  return latinDecoder;
}
550
551
552
/**
 *  Creates a decoder for the named character set.
 *
 *  Built-in decoders are chosen for the UTF-8, US-ASCII, ISO-8859-1 and
 *  "LOCALE" name families listed below; names are compared without regard
 *  to case, except "646" which is compared exactly.  Any other name is
 *  handed to apr_xlate when APR_HAS_XLATE is set.
 *
 *  @param charset name of the source encoding.
 *  @return a new decoder instance.
 *  @throws IllegalArgumentException when the name is not built in and
 *  either apr_xlate is unavailable or it cannot open the conversion.
 */
CharsetDecoderPtr CharsetDecoder::getDecoder(const LogString& charset)
{
  if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-8"), LOG4CXX_STR("utf-8")) ||
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF8"), LOG4CXX_STR("utf8")) ||
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP65001"), LOG4CXX_STR("cp65001")))
  {
#if LOG4CXX_LOGCHAR_IS_UTF8
    // Source and LogString are both UTF-8: no conversion needed.
    return std::make_shared<TrivialCharsetDecoder>();
#else
    return std::make_shared<UTF8CharsetDecoder>();
#endif
  }
  else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("C"), LOG4CXX_STR("c")) ||
    charset == LOG4CXX_STR("646") ||
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("US-ASCII"), LOG4CXX_STR("us-ascii")) ||
    // The third argument must be the fully lower-case spelling; with
    // "iso646-US" a lower-case "us" suffix would never match.
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO646-US"), LOG4CXX_STR("iso646-us")) ||
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ANSI_X3.4-1968"), LOG4CXX_STR("ansi_x3.4-1968")) ||
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP20127"), LOG4CXX_STR("cp20127")))
  {
    return std::make_shared<USASCIICharsetDecoder>();
  }
  else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-8859-1"), LOG4CXX_STR("iso-8859-1")) ||
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-LATIN-1"), LOG4CXX_STR("iso-latin-1")) ||
    StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP1252"), LOG4CXX_STR("cp1252")))
  {
    return std::make_shared<ISOLatinCharsetDecoder>();
  }
  else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("LOCALE"), LOG4CXX_STR("locale")))
  {
    return std::make_shared<LocaleCharsetDecoder>();
  }

  // Not a built-in name: defer to apr_xlate if it is available.
#if APR_HAS_XLATE
  return std::make_shared<APRCharsetDecoder>(charset);
#else
  throw IllegalArgumentException(charset);
#endif
}
590
591
/**
 *  Convenience overload that decodes a character array.
 *  The input length is taken from strnlen_s(in, maxByteCount).
 *  @param in bytes to decode.
 *  @param maxByteCount upper bound on the number of bytes examined.
 *  @param out receives the decoded characters.
 *  @return the status reported by decode(ByteBuffer&, LogString&).
 */
log4cxx_status_t CharsetDecoder::decode(const char* in, size_t maxByteCount, LogString& out)
{
  const size_t byteCount = strnlen_s(in, maxByteCount);
  // ByteBuffer takes a non-const pointer, hence the cast.
  ByteBuffer buf(const_cast<char*>(in), byteCount);
  return decode(buf, out);
}
596
597
/**
 *  Extracts the next UTF-8 encoded code point from \c in.
 *
 *  On success the buffer position is advanced past the sequence and the
 *  scalar value is returned.  On failure 0xFFFF is returned and the
 *  position is left unchanged.  Failure covers: an empty buffer, a stray
 *  continuation byte, a truncated sequence, an overlong encoding, a
 *  UTF-16 surrogate, a value above U+10FFFF, an invalid lead byte, and
 *  the encoding of U+FFFF itself (it cannot be told apart from the
 *  failure value).
 *
 *  @param in source bytes.
 *  @return the decoded code point, or 0xFFFF if none could be extracted.
 */
unsigned int CharsetDecoder::getUTF8CodePoint(ByteBuffer& in)
{
  const unsigned int invalid = 0xFFFF;
  const auto availableByteCount = in.remaining();
  if (0 == availableByteCount)
    return invalid;

  const auto pChar = in.current();
  const auto ch1 = static_cast<unsigned char>(pChar[0]);
  if (ch1 <= 0x7F)
  {
    in.increment_position(1);
    return ch1;
  }

  //
  //   should not have continuation character here,
  //   and every multi-byte form needs at least one more byte
  //
  if ((ch1 & 0xC0) == 0x80 || availableByteCount < 2)
    return invalid;

  const auto ch2 = static_cast<unsigned char>(pChar[1]);
  if ((ch2 & 0xC0) != 0x80) // not a continuation?
    return invalid;

  if ((ch1 & 0xE0) == 0xC0)
  {
    const unsigned int rv = ((ch1 & 0x1F) << 6) + (ch2 & 0x3F);
    if (rv < 0x80) // overlong encoding of a single byte character
      return invalid;

    in.increment_position(2);
    return rv;
  }

  if (availableByteCount < 3)
    return invalid;

  const auto ch3 = static_cast<unsigned char>(pChar[2]);
  if ((ch3 & 0xC0) != 0x80) // not a continuation?
    return invalid;

  if ((ch1 & 0xF0) == 0xE0)
  {
    const unsigned int rv = ((ch1 & 0x0F) << 12)
      + ((ch2 & 0x3F) << 6)
      + (ch3 & 0x3F);

    // RFC 3629 §3 prohibits UTF-8 encodings of the UTF-16 surrogate
    // halves (U+D800..U+DFFF); accepting them lets malformed Unicode
    // cross the decode boundary into LogString and downstream output.
    if (rv < 0x800 || (0xD800 <= rv && rv <= 0xDFFF))
      return invalid;

    // EF BF BF decodes to 0xFFFF, the value this function uses to signal
    // failure.  Reject it without consuming so that, like every other
    // failure return, the buffer position is left where it was.
    if (rv == invalid)
      return invalid;

    in.increment_position(3);
    return rv;
  }

  if (availableByteCount < 4)
    return invalid;

  const auto ch4 = static_cast<unsigned char>(pChar[3]);
  if ((ch4 & 0xC0) != 0x80) // not a continuation?
    return invalid;

  const unsigned int rv = ((ch1 & 0x07) << 18)
    + ((ch2 & 0x3F) << 12)
    + ((ch3 & 0x3F) << 6)
    + (ch4 & 0x3F);

  // RFC 3629 §3 caps UTF-8 at U+10FFFF; lead bytes F5..F7 (and
  // F4 with an over-high trailer) produce rv > 0x10FFFF, which
  // is not a Unicode code point and must not be passed on.
  // Lead bytes F8..FF are never valid UTF-8, but the & 0x07 mask
  // discards their high bits, so without the (ch1 & 0xF8) == 0xF0
  // guard F8 BF BF BF would alias to U+3FFFF instead of being
  // rejected.  rv > 0xFFFF rejects overlong four byte forms.
  if ((ch1 & 0xF8) == 0xF0 && rv > 0xFFFF && rv <= 0x10FFFF)
  {
    in.increment_position(4);
    return rv;
  }

  return invalid;
}