/src/logging-log4cxx/src/main/cpp/charsetdecoder.cpp
Line | Count | Source |
1 | | /* |
2 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
3 | | * contributor license agreements. See the NOTICE file distributed with |
4 | | * this work for additional information regarding copyright ownership. |
5 | | * The ASF licenses this file to You under the Apache License, Version 2.0 |
6 | | * (the "License"); you may not use this file except in compliance with |
7 | | * the License. You may obtain a copy of the License at |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | */ |
17 | | #define NOMINMAX /* tell windows not to define min/max macros */ |
18 | | #include <log4cxx/private/string_c11.h> |
19 | | #include <log4cxx/logstring.h> |
20 | | #include <log4cxx/helpers/charsetdecoder.h> |
21 | | #include <log4cxx/helpers/bytebuffer.h> |
22 | | #include <log4cxx/helpers/exception.h> |
23 | | #include <log4cxx/helpers/pool.h> |
24 | | #include <log4cxx/helpers/loglog.h> |
25 | | #include <apr_xlate.h> |
26 | | #if !defined(LOG4CXX) |
27 | | #define LOG4CXX 1 |
28 | | #endif |
29 | | #include <log4cxx/private/log4cxx_private.h> |
30 | | #include <locale.h> |
31 | | #include <apr_portable.h> |
32 | | #include <log4cxx/helpers/stringhelper.h> |
33 | | #include <log4cxx/helpers/transcoder.h> |
34 | | #include <mutex> |
35 | | |
36 | | using namespace LOG4CXX_NS; |
37 | | using namespace LOG4CXX_NS::helpers; |
38 | | |
39 | | IMPLEMENT_LOG4CXX_OBJECT(CharsetDecoder) |
40 | | |
41 | | |
42 | | namespace LOG4CXX_NS |
43 | | { |
44 | | namespace helpers |
45 | | { |
46 | | |
47 | | #if APR_HAS_XLATE |
48 | | /** |
49 | | * Converts from an arbitrary encoding to LogString |
50 | | * using apr_xlate. Requires real iconv implementation, |
51 | | * apr-iconv will crash in use. |
52 | | */ |
53 | | class APRCharsetDecoder : public CharsetDecoder |
54 | | { |
55 | | public: |
56 | | /** |
57 | | * Creates a new instance. |
58 | | * @param frompage name of source encoding. |
59 | | */ |
60 | 0 | APRCharsetDecoder(const LogString& frompage) : pool() |
61 | 0 | { |
62 | 0 | #if LOG4CXX_LOGCHAR_IS_WCHAR |
63 | 0 | const char* topage = "WCHAR_T"; |
64 | 0 | #endif |
65 | | #if LOG4CXX_LOGCHAR_IS_UTF8 |
66 | | const char* topage = "UTF-8"; |
67 | | #endif |
68 | | #if LOG4CXX_LOGCHAR_IS_UNICHAR |
69 | | const char* topage = "UTF-16"; |
70 | | #endif |
71 | 0 | std::string fpage(Transcoder::encodeCharsetName(frompage)); |
72 | 0 | apr_status_t stat = apr_xlate_open(&convset, |
73 | 0 | topage, |
74 | 0 | fpage.c_str(), |
75 | 0 | pool.getAPRPool()); |
76 | |
|
77 | 0 | if (stat != APR_SUCCESS) |
78 | 0 | { |
79 | 0 | throw IllegalArgumentException(frompage); |
80 | 0 | } |
81 | 0 | } |
82 | | |
83 | | /** |
84 | | * Destructor. |
85 | | */ |
86 | | virtual ~APRCharsetDecoder() |
87 | 0 | { |
88 | 0 | } |
89 | | |
90 | | virtual log4cxx_status_t decode(ByteBuffer& in, |
91 | | LogString& out) |
92 | 0 | { |
93 | 0 | enum { BUFSIZE = 256 }; |
94 | 0 | logchar buf[BUFSIZE]; |
95 | 0 | const apr_size_t initial_outbytes_left = BUFSIZE * sizeof(logchar); |
96 | 0 | apr_status_t stat = APR_SUCCESS; |
97 | |
|
98 | 0 | if (in.remaining() == 0) |
99 | 0 | { |
100 | 0 | size_t outbytes_left = initial_outbytes_left; |
101 | 0 | { |
102 | 0 | std::lock_guard<std::mutex> lock(mutex); |
103 | 0 | stat = apr_xlate_conv_buffer((apr_xlate_t*) convset, |
104 | 0 | NULL, NULL, (char*) buf, &outbytes_left); |
105 | 0 | } |
106 | 0 | out.append(buf, (initial_outbytes_left - outbytes_left) / sizeof(logchar)); |
107 | 0 | } |
108 | 0 | else |
109 | 0 | { |
110 | 0 | while (in.remaining() > 0 && stat == APR_SUCCESS) |
111 | 0 | { |
112 | 0 | size_t inbytes_left = in.remaining(); |
113 | 0 | size_t initial_inbytes_left = inbytes_left; |
114 | 0 | apr_size_t outbytes_left = initial_outbytes_left; |
115 | 0 | { |
116 | 0 | std::lock_guard<std::mutex> lock(mutex); |
117 | 0 | stat = apr_xlate_conv_buffer((apr_xlate_t*) convset, |
118 | 0 | in.current(), |
119 | 0 | &inbytes_left, |
120 | 0 | (char*) buf, |
121 | 0 | &outbytes_left); |
122 | 0 | } |
123 | 0 | out.append(buf, (initial_outbytes_left - outbytes_left) / sizeof(logchar)); |
124 | 0 | if (inbytes_left == initial_inbytes_left && stat == APR_SUCCESS) |
125 | 0 | { |
126 | 0 | stat = APR_BADCH; |
127 | 0 | break; |
128 | 0 | } |
129 | 0 | in.increment_position(initial_inbytes_left - inbytes_left); |
130 | 0 | } |
131 | 0 | } |
132 | |
|
133 | 0 | return stat; |
134 | 0 | } |
135 | | |
136 | | private: |
137 | | APRCharsetDecoder(const APRCharsetDecoder&); |
138 | | APRCharsetDecoder& operator=(const APRCharsetDecoder&); |
139 | | LOG4CXX_NS::helpers::Pool pool; |
140 | | std::mutex mutex; |
141 | | apr_xlate_t* convset; |
142 | | }; |
143 | | |
144 | | #endif |
145 | | |
146 | | #if LOG4CXX_LOGCHAR_IS_WCHAR && LOG4CXX_HAS_MBSRTOWCS |
147 | | /** |
148 | | * Converts from the default multi-byte string to |
149 | | * LogString using mbstowcs. |
150 | | * |
151 | | */ |
152 | | class MbstowcsCharsetDecoder : public CharsetDecoder |
153 | | { |
154 | | public: |
155 | | MbstowcsCharsetDecoder() |
156 | 0 | { |
157 | 0 | } |
158 | | |
159 | | virtual ~MbstowcsCharsetDecoder() |
160 | 0 | { |
161 | 0 | } |
162 | | |
163 | | private: |
164 | | inline log4cxx_status_t append(LogString& out, const wchar_t* buf) |
165 | 0 | { |
166 | 0 | out.append(buf); |
167 | 0 | return APR_SUCCESS; |
168 | 0 | } |
169 | | |
170 | | virtual log4cxx_status_t decode(ByteBuffer& in, |
171 | | LogString& out) |
172 | 0 | { |
173 | 0 | log4cxx_status_t stat = APR_SUCCESS; |
174 | 0 | enum { BUFSIZE = 256 }; |
175 | 0 | wchar_t wbuf[BUFSIZE]; |
176 | 0 | char cbuf[BUFSIZE*4]; |
177 | 0 |
|
178 | 0 | mbstate_t mbstate; |
179 | 0 | memset(&mbstate, 0, sizeof(mbstate)); |
180 | 0 |
|
181 | 0 | while (in.remaining() > 0) |
182 | 0 | { |
183 | 0 | const char* src = in.current(); |
184 | 0 |
|
185 | 0 | if (*src == 0) |
186 | 0 | { |
187 | 0 | out.append(1, (logchar) 0); |
188 | 0 | in.increment_position(1); |
189 | 0 | } |
190 | 0 | else |
191 | 0 | { |
192 | 0 | auto available = std::min(sizeof (cbuf) - 1, in.remaining()); |
193 | 0 | strncpy(cbuf, src, available); |
194 | 0 | cbuf[available] = 0; |
195 | 0 | src = cbuf; |
196 | 0 | size_t wCharCount = mbsrtowcs(wbuf, |
197 | 0 | &src, |
198 | 0 | BUFSIZE - 1, |
199 | 0 | &mbstate); |
200 | 0 | // mbsrtowcs sets *src to nullptr when it consumes a null wide character. |
201 | 0 | // Performing pointer arithmetic on that nullptr (src - cbuf) is undefined |
202 | 0 | // behaviour, so recover the consumed byte count from the position of the |
203 | 0 | // null that stopped the conversion instead. |
204 | 0 | size_t converted; |
205 | 0 | if (src == nullptr) |
206 | 0 | { |
207 | 0 | size_t nullPos = 0; |
208 | 0 | while (nullPos < available && cbuf[nullPos] != 0) |
209 | 0 | { |
210 | 0 | ++nullPos; |
211 | 0 | } |
212 | 0 | // If the null came from the input bytes, it was consumed too; |
213 | 0 | // if it is the sentinel we wrote at cbuf[available], stop at available. |
214 | 0 | converted = (nullPos < available) ? nullPos + 1 : available; |
215 | 0 | } |
216 | 0 | else |
217 | 0 | { |
218 | 0 | converted = static_cast<size_t>(src - cbuf); |
219 | 0 | } |
220 | 0 | in.increment_position(converted); |
221 | 0 |
|
222 | 0 | if (wCharCount == (size_t) -1) // Illegal byte sequence? |
223 | 0 | { |
224 | 0 | LogString msg(LOG4CXX_STR("Illegal byte sequence at ")); |
225 | 0 | msg.append(std::to_wstring(in.position())); |
226 | 0 | msg.append(LOG4CXX_STR(" of ")); |
227 | 0 | msg.append(std::to_wstring(in.limit())); |
228 | 0 | LogLog::warn(msg); |
229 | 0 | stat = APR_BADCH; |
230 | 0 | break; |
231 | 0 | } |
232 | 0 | else |
233 | 0 | { |
234 | 0 | // FIX: Check for incomplete sequence infinite loop. |
235 | 0 | // If mbsrtowcs returns success (>=0) but converted 0 bytes while data remains, |
236 | 0 | // we are stuck (e.g. incomplete multibyte char at EOF). |
237 | 0 | if (converted == 0 && in.remaining() > 0) |
238 | 0 | { |
239 | 0 | LogString msg(LOG4CXX_STR("Incomplete multibyte sequence at end of buffer")); |
240 | 0 | LogLog::warn(msg); |
241 | 0 | stat = APR_BADCH; |
242 | 0 | break; // Break the infinite loop |
243 | 0 | } |
244 | 0 |
|
245 | 0 | wbuf[wCharCount] = 0; |
246 | 0 | stat = append(out, wbuf); |
247 | 0 | } |
248 | 0 | } |
249 | 0 | } |
250 | 0 |
|
251 | 0 | return stat; |
252 | 0 | } |
253 | | |
254 | | |
255 | | |
256 | | private: |
257 | | MbstowcsCharsetDecoder(const MbstowcsCharsetDecoder&); |
258 | | MbstowcsCharsetDecoder& operator=(const MbstowcsCharsetDecoder&); |
259 | | }; |
260 | | #endif |
261 | | |
262 | | |
263 | | /** |
264 | | * Decoder used when the external and internal charsets |
265 | | * are the same. |
266 | | * |
267 | | */ |
268 | | class TrivialCharsetDecoder : public CharsetDecoder |
269 | | { |
270 | | public: |
271 | | TrivialCharsetDecoder() |
272 | 0 | { |
273 | 0 | } |
274 | | |
275 | | virtual ~TrivialCharsetDecoder() |
276 | 0 | { |
277 | 0 | } |
278 | | |
279 | | virtual log4cxx_status_t decode(ByteBuffer& in, |
280 | | LogString& out) |
281 | 0 | { |
282 | 0 | size_t remaining = in.remaining(); |
283 | 0 |
|
284 | 0 | if ( remaining > 0) |
285 | 0 | { |
286 | 0 | auto src = in.current(); |
287 | 0 | auto count = remaining / sizeof(logchar); |
288 | 0 | out.append(reinterpret_cast<const logchar*>(src), count); |
289 | 0 | in.increment_position(remaining); |
290 | 0 | } |
291 | 0 |
|
292 | 0 | return APR_SUCCESS; |
293 | 0 | } |
294 | | |
295 | | |
296 | | |
297 | | private: |
298 | | TrivialCharsetDecoder(const TrivialCharsetDecoder&); |
299 | | TrivialCharsetDecoder& operator=(const TrivialCharsetDecoder&); |
300 | | }; |
301 | | |
302 | | /** |
303 | | * Converts from UTF-8 to LogString |
304 | | * |
305 | | */ |
306 | | class UTF8CharsetDecoder : public CharsetDecoder |
307 | | { |
308 | | public: |
309 | | UTF8CharsetDecoder() |
310 | 1 | { |
311 | 1 | } |
312 | | |
313 | | virtual ~UTF8CharsetDecoder() |
314 | 0 | { |
315 | 0 | } |
316 | | |
317 | | private: |
318 | | virtual log4cxx_status_t decode(ByteBuffer& in, |
319 | | LogString& out) |
320 | 22.4M | { |
321 | 22.4M | auto availableByteCount = in.remaining(); |
322 | 72.7M | while (0 < availableByteCount) |
323 | 72.7M | { |
324 | 72.7M | auto sv = getUTF8CodePoint(in); |
325 | 72.7M | auto nextAvailableByteCount = in.remaining(); |
326 | 72.7M | if (sv == 0xFFFF || nextAvailableByteCount == availableByteCount) |
327 | 22.4M | return APR_BADCH; |
328 | 50.3M | Transcoder::encode(sv, out); |
329 | 50.3M | availableByteCount = nextAvailableByteCount; |
330 | 50.3M | } |
331 | 1.95k | return APR_SUCCESS; |
332 | 22.4M | } |
333 | | |
334 | | private: |
335 | | UTF8CharsetDecoder(const UTF8CharsetDecoder&); |
336 | | UTF8CharsetDecoder& operator=(const UTF8CharsetDecoder&); |
337 | | }; |
338 | | |
339 | | /** |
340 | | * Converts from ISO-8859-1 to LogString. |
341 | | * |
342 | | */ |
343 | | class ISOLatinCharsetDecoder : public CharsetDecoder |
344 | | { |
345 | | public: |
346 | | ISOLatinCharsetDecoder() |
347 | 0 | { |
348 | 0 | } |
349 | | |
350 | | virtual ~ISOLatinCharsetDecoder() |
351 | 0 | { |
352 | 0 | } |
353 | | |
354 | | private: |
355 | | virtual log4cxx_status_t decode(ByteBuffer& in, |
356 | | LogString& out) |
357 | 0 | { |
358 | 0 | auto availableByteCount = in.remaining(); |
359 | 0 | auto src = in.current(); |
360 | 0 | auto srcEnd = src + availableByteCount; |
361 | |
|
362 | 0 | while (src < srcEnd) |
363 | 0 | { |
364 | 0 | auto sv = static_cast<unsigned int>(static_cast<unsigned char>(*src++)); |
365 | 0 | Transcoder::encode(sv, out); |
366 | 0 | } |
367 | 0 | in.increment_position(availableByteCount); |
368 | |
|
369 | 0 | return APR_SUCCESS; |
370 | 0 | } |
371 | | |
372 | | |
373 | | |
374 | | private: |
375 | | ISOLatinCharsetDecoder(const ISOLatinCharsetDecoder&); |
376 | | ISOLatinCharsetDecoder& operator=(const ISOLatinCharsetDecoder&); |
377 | | }; |
378 | | |
379 | | |
380 | | /** |
381 | | * Converts from US-ASCII to LogString. |
382 | | * |
383 | | */ |
384 | | class USASCIICharsetDecoder : public CharsetDecoder |
385 | | { |
386 | | public: |
387 | | USASCIICharsetDecoder() |
388 | 0 | { |
389 | 0 | } |
390 | | |
391 | | virtual ~USASCIICharsetDecoder() |
392 | 0 | { |
393 | 0 | } |
394 | | |
395 | | private: |
396 | | |
397 | | virtual log4cxx_status_t decode(ByteBuffer& in, |
398 | | LogString& out) |
399 | 0 | { |
400 | 0 | log4cxx_status_t stat = APR_SUCCESS; |
401 | |
|
402 | 0 | auto availableByteCount = in.remaining(); |
403 | 0 | auto src = in.current(); |
404 | 0 | auto srcEnd = src + availableByteCount; |
405 | 0 | size_t byteCount = 0; |
406 | 0 | while (src < srcEnd) |
407 | 0 | { |
408 | 0 | auto sv = static_cast<unsigned int>(*src++); |
409 | |
|
410 | 0 | if (sv < 0x80) |
411 | 0 | { |
412 | 0 | ++byteCount; |
413 | 0 | Transcoder::encode(sv, out); |
414 | 0 | } |
415 | 0 | else |
416 | 0 | { |
417 | 0 | stat = APR_BADCH; |
418 | 0 | break; |
419 | 0 | } |
420 | 0 | } |
421 | 0 | in.increment_position(byteCount); |
422 | |
|
423 | 0 | return stat; |
424 | 0 | } |
425 | | |
426 | | |
427 | | |
428 | | private: |
429 | | USASCIICharsetDecoder(const USASCIICharsetDecoder&); |
430 | | USASCIICharsetDecoder& operator=(const USASCIICharsetDecoder&); |
431 | | }; |
432 | | |
433 | | /** |
434 | | * Charset decoder that uses current locale settings. |
435 | | */ |
436 | | class LocaleCharsetDecoder : public CharsetDecoder |
437 | | { |
438 | | public: |
439 | 0 | LocaleCharsetDecoder() : state() |
440 | 0 | { |
441 | 0 | } |
442 | | log4cxx_status_t decode(ByteBuffer& in, LogString& out) override |
443 | 0 | { |
444 | 0 | log4cxx_status_t result = APR_SUCCESS; |
445 | 0 | auto p = in.current(); |
446 | 0 | auto availableByteCount = in.remaining(); |
447 | 0 | size_t byteCount = 0; |
448 | 0 | #if !LOG4CXX_CHARSET_EBCDIC |
449 | 0 | if (std::mbsinit(&this->state)) // ByteBuffer not partially decoded? |
450 | 0 | { |
451 | | // Copy single byte characters |
452 | 0 | for (; byteCount < availableByteCount && static_cast<unsigned int>(*p) < 0x80; ++byteCount, ++p) |
453 | 0 | { |
454 | 0 | out.append(1, *p); |
455 | 0 | } |
456 | 0 | } |
457 | 0 | #endif |
458 | | // Decode characters that may be represented by multiple bytes |
459 | 0 | while (byteCount < availableByteCount) |
460 | 0 | { |
461 | 0 | wchar_t ch = 0; |
462 | 0 | size_t n = std::mbrtowc(&ch, p, availableByteCount - byteCount, &this->state); |
463 | 0 | if (0 == n) // NULL encountered? |
464 | 0 | { |
465 | 0 | ++byteCount; |
466 | 0 | break; |
467 | 0 | } |
468 | 0 | if (static_cast<std::size_t>(-1) == n) // decoding error? |
469 | 0 | { |
470 | 0 | result = APR_BADCH; |
471 | 0 | break; |
472 | 0 | } |
473 | 0 | if (static_cast<std::size_t>(-2) == n) // incomplete sequence? |
474 | 0 | { |
475 | 0 | break; |
476 | 0 | } |
477 | 0 | Transcoder::encode(static_cast<unsigned int>(ch), out); |
478 | 0 | byteCount += n; |
479 | 0 | p += n; |
480 | 0 | } |
481 | 0 | in.increment_position(byteCount); |
482 | 0 | return result; |
483 | 0 | } |
484 | | |
485 | | private: |
486 | | std::mbstate_t state; |
487 | | }; |
488 | | |
489 | | |
490 | | |
491 | | } // namespace helpers |
492 | | |
493 | | } //namespace log4cxx |
494 | | |
495 | | |
496 | | CharsetDecoder::CharsetDecoder() |
497 | 1 | { |
498 | 1 | } |
499 | | |
500 | | |
501 | | CharsetDecoder::~CharsetDecoder() |
502 | 1 | { |
503 | 1 | } |
504 | | |
505 | | CharsetDecoder* CharsetDecoder::createDefaultDecoder() |
506 | 1 | { |
507 | 1 | #if LOG4CXX_CHARSET_UTF8 |
508 | | #if LOG4CXX_LOGCHAR_IS_UTF8 |
509 | | return new TrivialCharsetDecoder(); |
510 | | #else |
511 | 1 | return new UTF8CharsetDecoder(); |
512 | 1 | #endif |
513 | | #elif LOG4CXX_CHARSET_ISO88591 || defined(_WIN32_WCE) |
514 | | return new ISOLatinCharsetDecoder(); |
515 | | #elif LOG4CXX_CHARSET_USASCII |
516 | | return new USASCIICharsetDecoder(); |
517 | | #elif LOG4CXX_LOGCHAR_IS_WCHAR && LOG4CXX_HAS_MBSRTOWCS |
518 | | return new MbstowcsCharsetDecoder(); |
519 | | #else |
520 | | return new LocaleCharsetDecoder(); |
521 | | #endif |
522 | 1 | } |
523 | | |
524 | | CharsetDecoderPtr CharsetDecoder::getDefaultDecoder() |
525 | 1 | { |
526 | 1 | static WideLife<CharsetDecoderPtr> decoder(createDefaultDecoder()); |
527 | | |
528 | | // |
529 | | // if invoked after static variable destruction |
530 | | // (if logging is called in the destructor of a static object) |
531 | | // then create a new decoder. |
532 | | // |
533 | 1 | if (decoder.value() == 0) |
534 | 0 | { |
535 | 0 | return CharsetDecoderPtr( createDefaultDecoder() ); |
536 | 0 | } |
537 | | |
538 | 1 | return decoder; |
539 | 1 | } |
540 | | |
541 | | CharsetDecoderPtr CharsetDecoder::getUTF8Decoder() |
542 | 0 | { |
543 | 0 | return std::make_shared<UTF8CharsetDecoder>(); |
544 | 0 | } |
545 | | |
546 | | CharsetDecoderPtr CharsetDecoder::getISOLatinDecoder() |
547 | 0 | { |
548 | 0 | return std::make_shared<ISOLatinCharsetDecoder>(); |
549 | 0 | } |
550 | | |
551 | | |
552 | | CharsetDecoderPtr CharsetDecoder::getDecoder(const LogString& charset) |
553 | 0 | { |
554 | 0 | if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-8"), LOG4CXX_STR("utf-8")) || |
555 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF8"), LOG4CXX_STR("utf8")) || |
556 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP65001"), LOG4CXX_STR("cp65001"))) |
557 | 0 | { |
558 | | #if LOG4CXX_LOGCHAR_IS_UTF8 |
559 | | return std::make_shared<TrivialCharsetDecoder>(); |
560 | | #else |
561 | 0 | return std::make_shared<UTF8CharsetDecoder>(); |
562 | 0 | #endif |
563 | 0 | } |
564 | 0 | else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("C"), LOG4CXX_STR("c")) || |
565 | 0 | charset == LOG4CXX_STR("646") || |
566 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("US-ASCII"), LOG4CXX_STR("us-ascii")) || |
567 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO646-US"), LOG4CXX_STR("iso646-US")) || |
568 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ANSI_X3.4-1968"), LOG4CXX_STR("ansi_x3.4-1968")) || |
569 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP20127"), LOG4CXX_STR("cp20127"))) |
570 | 0 | { |
571 | 0 | return std::make_shared<USASCIICharsetDecoder>(); |
572 | 0 | } |
573 | 0 | else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-8859-1"), LOG4CXX_STR("iso-8859-1")) || |
574 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-LATIN-1"), LOG4CXX_STR("iso-latin-1")) || |
575 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP1252"), LOG4CXX_STR("cp1252"))) |
576 | 0 | { |
577 | 0 | return std::make_shared<ISOLatinCharsetDecoder>(); |
578 | 0 | } |
579 | 0 | else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("LOCALE"), LOG4CXX_STR("locale"))) |
580 | 0 | { |
581 | 0 | return std::make_shared<LocaleCharsetDecoder>(); |
582 | 0 | } |
583 | | |
584 | 0 | #if APR_HAS_XLATE |
585 | 0 | return std::make_shared<APRCharsetDecoder>(charset); |
586 | | #else |
587 | | throw IllegalArgumentException(charset); |
588 | | #endif |
589 | 0 | } |
590 | | |
591 | | log4cxx_status_t CharsetDecoder::decode(const char* in, size_t maxByteCount, LogString& out) |
592 | 0 | { |
593 | 0 | ByteBuffer buf((char*)in, strnlen_s(in, maxByteCount)); |
594 | 0 | return decode(buf, out); |
595 | 0 | } |
596 | | |
597 | | unsigned int CharsetDecoder::getUTF8CodePoint(ByteBuffer& in) |
598 | 72.7M | { |
599 | 72.7M | auto availableByteCount = in.remaining(); |
600 | 72.7M | if (0 == availableByteCount) |
601 | 0 | return 0xFFFF; |
602 | | |
603 | 72.7M | auto pChar = in.current(); |
604 | 72.7M | auto ch1 = static_cast<unsigned char>(*pChar); |
605 | 72.7M | if (ch1 <= 0x7F) |
606 | 50.3M | { |
607 | 50.3M | in.increment_position(1); |
608 | 50.3M | return ch1; |
609 | 50.3M | } |
610 | | |
611 | | // |
612 | | // should not have continuation character here |
613 | | // |
614 | 22.4M | if ((ch1 & 0xC0) != 0x80 && 1 < availableByteCount) |
615 | 5.62M | { |
616 | 5.62M | auto ch2 = static_cast<unsigned char>(*(pChar + 1)); |
617 | 5.62M | if ((ch2 & 0xC0) != 0x80) // not a continuation? |
618 | 4.83M | return 0xFFFF; |
619 | | |
620 | 788k | if ((ch1 & 0xE0) == 0xC0) |
621 | 3.38k | { |
622 | 3.38k | unsigned int rv = ((ch1 & 0x1F) << 6) + (ch2 & 0x3F); |
623 | 3.38k | if (rv >= 0x80) |
624 | 2.45k | { |
625 | 2.45k | in.increment_position(2); |
626 | 2.45k | return rv; |
627 | 2.45k | } |
628 | 933 | return 0xFFFF; |
629 | 3.38k | } |
630 | | |
631 | 785k | if (2 < availableByteCount) |
632 | 785k | { |
633 | 785k | auto ch3 = static_cast<unsigned char>(*(pChar + 2)); |
634 | 785k | if ((ch3 & 0xC0) != 0x80) // not a continuation? |
635 | 323k | return 0xFFFF; |
636 | | |
637 | 461k | if ((ch1 & 0xF0) == 0xE0) |
638 | 130k | { |
639 | 130k | unsigned int rv = ((ch1 & 0x0F) << 12) |
640 | 130k | + ((ch2 & 0x3F) << 6) |
641 | 130k | + (ch3 & 0x3F); |
642 | | |
643 | | // RFC 3629 §3 prohibits UTF-8 encodings of the UTF-16 surrogate |
644 | | // halves (U+D800..U+DFFF); accepting them lets malformed Unicode |
645 | | // cross the decode boundary into LogString and downstream output. |
646 | 130k | if (rv < 0x800 || (0xD800 <= rv && rv <= 0xDFFF)) |
647 | 123k | return 0xFFFF; |
648 | | |
649 | 7.23k | in.increment_position(3); |
650 | 7.23k | return rv; |
651 | 130k | } |
652 | | |
653 | 331k | if (3 < availableByteCount) |
654 | 330k | { |
655 | 330k | auto ch4 = static_cast<unsigned char>(*(pChar + 3)); |
656 | 330k | if ((ch4 & 0xC0) != 0x80) // not a continuation? |
657 | 322k | return 0xFFFF; |
658 | | |
659 | 8.91k | unsigned int rv = ((ch1 & 0x07) << 18) |
660 | 8.91k | + ((ch2 & 0x3F) << 12) |
661 | 8.91k | + ((ch3 & 0x3F) << 6) |
662 | 8.91k | + (ch4 & 0x3F); |
663 | | |
664 | | // RFC 3629 §3 caps UTF-8 at U+10FFFF; lead bytes F5..F7 (and |
665 | | // F4 with an over-high trailer) produce rv > 0x10FFFF, which |
666 | | // is not a Unicode code point. Without this bound, encodeUTF16 |
667 | | // later silently aliases the bogus value to a valid in-range |
668 | | // code point — a substitution-collision filter-bypass primitive. |
669 | | // Lead bytes F8..FF are never valid UTF-8, but the & 0x07 mask |
670 | | // discards their high bits, so without the (ch1 & 0xF8) == 0xF0 |
671 | | // guard F8 BF BF BF would alias to U+3FFFF instead of being |
672 | | // rejected. |
673 | 8.91k | if ((ch1 & 0xF8) == 0xF0 && rv > 0xFFFF && rv <= 0x10FFFF) |
674 | 2.72k | { |
675 | 2.72k | in.increment_position(4); |
676 | 2.72k | return rv; |
677 | 2.72k | } |
678 | | |
679 | 8.91k | } |
680 | 331k | } |
681 | 785k | } |
682 | 16.8M | return 0xFFFF; |
683 | 22.4M | } |