/src/logging-log4cxx/src/main/cpp/charsetdecoder.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
3 | | * contributor license agreements. See the NOTICE file distributed with |
4 | | * this work for additional information regarding copyright ownership. |
5 | | * The ASF licenses this file to You under the Apache License, Version 2.0 |
6 | | * (the "License"); you may not use this file except in compliance with |
7 | | * the License. You may obtain a copy of the License at |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | */ |
17 | | #define NOMINMAX /* tell windows not to define min/max macros */ |
18 | | #include <log4cxx/logstring.h> |
19 | | #include <log4cxx/helpers/charsetdecoder.h> |
20 | | #include <log4cxx/helpers/bytebuffer.h> |
21 | | #include <log4cxx/helpers/exception.h> |
22 | | #include <log4cxx/helpers/pool.h> |
23 | | #include <log4cxx/helpers/loglog.h> |
24 | | #include <apr_xlate.h> |
25 | | #if !defined(LOG4CXX) |
26 | | #define LOG4CXX 1 |
27 | | #endif |
28 | | #include <log4cxx/private/log4cxx_private.h> |
29 | | #include <locale.h> |
30 | | #include <apr_portable.h> |
31 | | #include <log4cxx/helpers/stringhelper.h> |
32 | | #include <log4cxx/helpers/transcoder.h> |
33 | | #include <mutex> |
34 | | |
35 | | using namespace LOG4CXX_NS; |
36 | | using namespace LOG4CXX_NS::helpers; |
37 | | |
38 | | IMPLEMENT_LOG4CXX_OBJECT(CharsetDecoder) |
39 | | |
40 | | |
41 | | namespace LOG4CXX_NS |
42 | | { |
43 | | namespace helpers |
44 | | { |
45 | | |
46 | | #if APR_HAS_XLATE |
47 | | /** |
48 | | * Converts from an arbitrary encoding to LogString |
49 | | * using apr_xlate. Requires real iconv implementation, |
50 | | * apr-iconv will crash in use. |
51 | | */ |
52 | | class APRCharsetDecoder : public CharsetDecoder |
53 | | { |
54 | | public: |
55 | | /** |
56 | | * Creates a new instance. |
57 | | * @param frompage name of source encoding. |
58 | | */ |
59 | 0 | APRCharsetDecoder(const LogString& frompage) : pool() |
60 | 0 | { |
61 | | #if LOG4CXX_LOGCHAR_IS_WCHAR |
62 | | const char* topage = "WCHAR_T"; |
63 | | #endif |
64 | 0 | #if LOG4CXX_LOGCHAR_IS_UTF8 |
65 | 0 | const char* topage = "UTF-8"; |
66 | 0 | #endif |
67 | | #if LOG4CXX_LOGCHAR_IS_UNICHAR |
68 | | const char* topage = "UTF-16"; |
69 | | #endif |
70 | 0 | std::string fpage(Transcoder::encodeCharsetName(frompage)); |
71 | 0 | apr_status_t stat = apr_xlate_open(&convset, |
72 | 0 | topage, |
73 | 0 | fpage.c_str(), |
74 | 0 | pool.getAPRPool()); |
75 | |
|
76 | 0 | if (stat != APR_SUCCESS) |
77 | 0 | { |
78 | 0 | throw IllegalArgumentException(frompage); |
79 | 0 | } |
80 | 0 | } |
81 | | |
82 | | /** |
83 | | * Destructor. |
84 | | */ |
85 | | virtual ~APRCharsetDecoder() |
86 | 0 | { |
87 | 0 | } |
88 | | |
89 | | virtual log4cxx_status_t decode(ByteBuffer& in, |
90 | | LogString& out) |
91 | 0 | { |
92 | 0 | enum { BUFSIZE = 256 }; |
93 | 0 | logchar buf[BUFSIZE]; |
94 | 0 | const apr_size_t initial_outbytes_left = BUFSIZE * sizeof(logchar); |
95 | 0 | apr_status_t stat = APR_SUCCESS; |
96 | |
|
97 | 0 | if (in.remaining() == 0) |
98 | 0 | { |
99 | 0 | size_t outbytes_left = initial_outbytes_left; |
100 | 0 | { |
101 | 0 | std::lock_guard<std::mutex> lock(mutex); |
102 | 0 | stat = apr_xlate_conv_buffer((apr_xlate_t*) convset, |
103 | 0 | NULL, NULL, (char*) buf, &outbytes_left); |
104 | 0 | } |
105 | 0 | out.append(buf, (initial_outbytes_left - outbytes_left) / sizeof(logchar)); |
106 | 0 | } |
107 | 0 | else |
108 | 0 | { |
109 | 0 | while (in.remaining() > 0 && stat == APR_SUCCESS) |
110 | 0 | { |
111 | 0 | size_t inbytes_left = in.remaining(); |
112 | 0 | size_t initial_inbytes_left = inbytes_left; |
113 | 0 | size_t pos = in.position(); |
114 | 0 | apr_size_t outbytes_left = initial_outbytes_left; |
115 | 0 | { |
116 | 0 | std::lock_guard<std::mutex> lock(mutex); |
117 | 0 | stat = apr_xlate_conv_buffer((apr_xlate_t*) convset, |
118 | 0 | in.data() + pos, |
119 | 0 | &inbytes_left, |
120 | 0 | (char*) buf, |
121 | 0 | &outbytes_left); |
122 | 0 | } |
123 | 0 | out.append(buf, (initial_outbytes_left - outbytes_left) / sizeof(logchar)); |
124 | 0 | in.position(pos + (initial_inbytes_left - inbytes_left)); |
125 | 0 | } |
126 | 0 | } |
127 | |
|
128 | 0 | return stat; |
129 | 0 | } |
130 | | |
131 | | private: |
132 | | APRCharsetDecoder(const APRCharsetDecoder&); |
133 | | APRCharsetDecoder& operator=(const APRCharsetDecoder&); |
134 | | LOG4CXX_NS::helpers::Pool pool; |
135 | | std::mutex mutex; |
136 | | apr_xlate_t* convset; |
137 | | }; |
138 | | |
139 | | #endif |
140 | | |
141 | | #if LOG4CXX_LOGCHAR_IS_WCHAR && LOG4CXX_HAS_MBSRTOWCS |
142 | | /** |
143 | | * Converts from the default multi-byte string to |
144 | | * LogString using mbstowcs. |
145 | | * |
146 | | */ |
147 | | class MbstowcsCharsetDecoder : public CharsetDecoder |
148 | | { |
149 | | public: |
150 | | MbstowcsCharsetDecoder() |
151 | | { |
152 | | } |
153 | | |
154 | | virtual ~MbstowcsCharsetDecoder() |
155 | | { |
156 | | } |
157 | | |
158 | | private: |
159 | | inline log4cxx_status_t append(LogString& out, const wchar_t* buf) |
160 | | { |
161 | | out.append(buf); |
162 | | return APR_SUCCESS; |
163 | | } |
164 | | |
165 | | virtual log4cxx_status_t decode(ByteBuffer& in, |
166 | | LogString& out) |
167 | | { |
168 | | log4cxx_status_t stat = APR_SUCCESS; |
169 | | enum { BUFSIZE = 256 }; |
170 | | wchar_t wbuf[BUFSIZE]; |
171 | | char cbuf[BUFSIZE*4]; |
172 | | |
173 | | mbstate_t mbstate; |
174 | | memset(&mbstate, 0, sizeof(mbstate)); |
175 | | |
176 | | while (in.remaining() > 0) |
177 | | { |
178 | | const char* src = in.current(); |
179 | | |
180 | | if (*src == 0) |
181 | | { |
182 | | out.append(1, (logchar) 0); |
183 | | in.position(in.position() + 1); |
184 | | } |
185 | | else |
186 | | { |
187 | | auto available = std::min(sizeof (cbuf) - 1, in.remaining()); |
188 | | strncpy(cbuf, src, available); |
189 | | cbuf[available] = 0; |
190 | | src = cbuf; |
191 | | size_t wCharCount = mbsrtowcs(wbuf, |
192 | | &src, |
193 | | BUFSIZE - 1, |
194 | | &mbstate); |
195 | | auto converted = src - cbuf; |
196 | | in.position(in.position() + converted); |
197 | | |
198 | | if (wCharCount == (size_t) -1) // Illegal byte sequence? |
199 | | { |
200 | | LogString msg(LOG4CXX_STR("Illegal byte sequence at ")); |
201 | | msg.append(std::to_wstring(in.position())); |
202 | | msg.append(LOG4CXX_STR(" of ")); |
203 | | msg.append(std::to_wstring(in.limit())); |
204 | | LogLog::warn(msg); |
205 | | stat = APR_BADCH; |
206 | | break; |
207 | | } |
208 | | else |
209 | | { |
210 | | wbuf[wCharCount] = 0; |
211 | | stat = append(out, wbuf); |
212 | | } |
213 | | } |
214 | | } |
215 | | |
216 | | return stat; |
217 | | } |
218 | | |
219 | | |
220 | | |
221 | | private: |
222 | | MbstowcsCharsetDecoder(const MbstowcsCharsetDecoder&); |
223 | | MbstowcsCharsetDecoder& operator=(const MbstowcsCharsetDecoder&); |
224 | | }; |
225 | | #endif |
226 | | |
227 | | |
228 | | /** |
229 | | * Decoder used when the external and internal charsets |
230 | | * are the same. |
231 | | * |
232 | | */ |
233 | | class TrivialCharsetDecoder : public CharsetDecoder |
234 | | { |
235 | | public: |
236 | | TrivialCharsetDecoder() |
237 | 0 | { |
238 | 0 | } |
239 | | |
240 | | virtual ~TrivialCharsetDecoder() |
241 | 0 | { |
242 | 0 | } |
243 | | |
244 | | virtual log4cxx_status_t decode(ByteBuffer& in, |
245 | | LogString& out) |
246 | 0 | { |
247 | 0 | size_t remaining = in.remaining(); |
248 | |
|
249 | 0 | if ( remaining > 0) |
250 | 0 | { |
251 | 0 | const logchar* src = (const logchar*) (in.data() + in.position()); |
252 | 0 | size_t count = remaining / sizeof(logchar); |
253 | 0 | out.append(src, count); |
254 | 0 | in.position(in.position() + remaining); |
255 | 0 | } |
256 | |
|
257 | 0 | return APR_SUCCESS; |
258 | 0 | } |
259 | | |
260 | | |
261 | | |
262 | | private: |
263 | | TrivialCharsetDecoder(const TrivialCharsetDecoder&); |
264 | | TrivialCharsetDecoder& operator=(const TrivialCharsetDecoder&); |
265 | | }; |
266 | | |
267 | | |
268 | | #if LOG4CXX_LOGCHAR_IS_UTF8 |
269 | | typedef TrivialCharsetDecoder UTF8CharsetDecoder; |
270 | | #else |
271 | | /** |
272 | | * Converts from UTF-8 to std::wstring |
273 | | * |
274 | | */ |
275 | | class UTF8CharsetDecoder : public CharsetDecoder |
276 | | { |
277 | | public: |
278 | | UTF8CharsetDecoder() |
279 | | { |
280 | | } |
281 | | |
282 | | virtual ~UTF8CharsetDecoder() |
283 | | { |
284 | | } |
285 | | |
286 | | private: |
287 | | virtual log4cxx_status_t decode(ByteBuffer& in, |
288 | | LogString& out) |
289 | | { |
290 | | if (in.remaining() > 0) |
291 | | { |
292 | | std::string tmp(in.current(), in.remaining()); |
293 | | std::string::const_iterator iter = tmp.begin(); |
294 | | |
295 | | while (iter != tmp.end()) |
296 | | { |
297 | | unsigned int sv = Transcoder::decode(tmp, iter); |
298 | | |
299 | | if (sv == 0xFFFF) |
300 | | { |
301 | | size_t offset = iter - tmp.begin(); |
302 | | in.position(in.position() + offset); |
303 | | return APR_BADARG; |
304 | | } |
305 | | else |
306 | | { |
307 | | Transcoder::encode(sv, out); |
308 | | } |
309 | | } |
310 | | |
311 | | in.position(in.limit()); |
312 | | } |
313 | | |
314 | | return APR_SUCCESS; |
315 | | } |
316 | | |
317 | | private: |
318 | | UTF8CharsetDecoder(const UTF8CharsetDecoder&); |
319 | | UTF8CharsetDecoder& operator=(const UTF8CharsetDecoder&); |
320 | | }; |
321 | | #endif |
322 | | |
323 | | /** |
324 | | * Converts from ISO-8859-1 to LogString. |
325 | | * |
326 | | */ |
327 | | class ISOLatinCharsetDecoder : public CharsetDecoder |
328 | | { |
329 | | public: |
330 | | ISOLatinCharsetDecoder() |
331 | 0 | { |
332 | 0 | } |
333 | | |
334 | | virtual ~ISOLatinCharsetDecoder() |
335 | 0 | { |
336 | 0 | } |
337 | | |
338 | | private: |
339 | | virtual log4cxx_status_t decode(ByteBuffer& in, |
340 | | LogString& out) |
341 | 0 | { |
342 | 0 | if (in.remaining() > 0) |
343 | 0 | { |
344 | |
|
345 | 0 | const unsigned char* src = (unsigned char*) in.current(); |
346 | 0 | const unsigned char* srcEnd = src + in.remaining(); |
347 | |
|
348 | 0 | while (src < srcEnd) |
349 | 0 | { |
350 | 0 | unsigned int sv = *(src++); |
351 | 0 | Transcoder::encode(sv, out); |
352 | 0 | } |
353 | |
|
354 | 0 | in.position(in.limit()); |
355 | 0 | } |
356 | |
|
357 | 0 | return APR_SUCCESS; |
358 | 0 | } |
359 | | |
360 | | |
361 | | |
362 | | private: |
363 | | ISOLatinCharsetDecoder(const ISOLatinCharsetDecoder&); |
364 | | ISOLatinCharsetDecoder& operator=(const ISOLatinCharsetDecoder&); |
365 | | }; |
366 | | |
367 | | |
368 | | /** |
369 | | * Converts from US-ASCII to LogString. |
370 | | * |
371 | | */ |
372 | | class USASCIICharsetDecoder : public CharsetDecoder |
373 | | { |
374 | | public: |
375 | | USASCIICharsetDecoder() |
376 | 0 | { |
377 | 0 | } |
378 | | |
379 | | virtual ~USASCIICharsetDecoder() |
380 | 0 | { |
381 | 0 | } |
382 | | |
383 | | private: |
384 | | |
385 | | virtual log4cxx_status_t decode(ByteBuffer& in, |
386 | | LogString& out) |
387 | 0 | { |
388 | 0 | log4cxx_status_t stat = APR_SUCCESS; |
389 | |
|
390 | 0 | if (in.remaining() > 0) |
391 | 0 | { |
392 | |
|
393 | 0 | const unsigned char* src = (unsigned char*) in.current(); |
394 | 0 | const unsigned char* srcEnd = src + in.remaining(); |
395 | |
|
396 | 0 | while (src < srcEnd) |
397 | 0 | { |
398 | 0 | unsigned char sv = *src; |
399 | |
|
400 | 0 | if (sv < 0x80) |
401 | 0 | { |
402 | 0 | src++; |
403 | 0 | Transcoder::encode(sv, out); |
404 | 0 | } |
405 | 0 | else |
406 | 0 | { |
407 | 0 | stat = APR_BADARG; |
408 | 0 | break; |
409 | 0 | } |
410 | 0 | } |
411 | |
|
412 | 0 | in.position(src - (const unsigned char*) in.data()); |
413 | 0 | } |
414 | |
|
415 | 0 | return stat; |
416 | 0 | } |
417 | | |
418 | | |
419 | | |
420 | | private: |
421 | | USASCIICharsetDecoder(const USASCIICharsetDecoder&); |
422 | | USASCIICharsetDecoder& operator=(const USASCIICharsetDecoder&); |
423 | | }; |
424 | | |
425 | | /** |
426 | | * Charset decoder that uses current locale settings. |
427 | | */ |
428 | | class LocaleCharsetDecoder : public CharsetDecoder |
429 | | { |
430 | | public: |
431 | 0 | LocaleCharsetDecoder() : state() |
432 | 0 | { |
433 | 0 | } |
434 | | log4cxx_status_t decode(ByteBuffer& in, LogString& out) override |
435 | 0 | { |
436 | 0 | log4cxx_status_t result = APR_SUCCESS; |
437 | 0 | const char* p = in.current(); |
438 | 0 | size_t i = in.position(); |
439 | 0 | size_t remain = in.limit() - i; |
440 | 0 | #if !LOG4CXX_CHARSET_EBCDIC |
441 | 0 | if (std::mbsinit(&this->state)) // ByteBuffer not partially decoded? |
442 | 0 | { |
443 | | // Copy single byte characters |
444 | 0 | for (; 0 < remain && ((unsigned int) *p) < 0x80; --remain, ++i, p++) |
445 | 0 | { |
446 | 0 | out.append(1, *p); |
447 | 0 | } |
448 | 0 | } |
449 | 0 | #endif |
450 | | // Decode characters that may be represented by multiple bytes |
451 | 0 | while (0 < remain) |
452 | 0 | { |
453 | 0 | wchar_t ch = 0; |
454 | 0 | size_t n = std::mbrtowc(&ch, p, remain, &this->state); |
455 | 0 | if (0 == n) // NULL encountered? |
456 | 0 | { |
457 | 0 | ++i; |
458 | 0 | break; |
459 | 0 | } |
460 | 0 | if (static_cast<std::size_t>(-1) == n) // decoding error? |
461 | 0 | { |
462 | 0 | result = APR_BADARG; |
463 | 0 | break; |
464 | 0 | } |
465 | 0 | if (static_cast<std::size_t>(-2) == n) // incomplete sequence? |
466 | 0 | { |
467 | 0 | break; |
468 | 0 | } |
469 | 0 | Transcoder::encode(static_cast<unsigned int>(ch), out); |
470 | 0 | remain -= n; |
471 | 0 | i += n; |
472 | 0 | p += n; |
473 | 0 | } |
474 | 0 | in.position(i); |
475 | 0 | return result; |
476 | 0 | } |
477 | | |
478 | | private: |
479 | | std::mbstate_t state; |
480 | | }; |
481 | | |
482 | | |
483 | | |
484 | | } // namespace helpers |
485 | | |
486 | | } //namespace log4cxx |
487 | | |
488 | | |
489 | | CharsetDecoder::CharsetDecoder() |
490 | 0 | { |
491 | 0 | } |
492 | | |
493 | | |
494 | | CharsetDecoder::~CharsetDecoder() |
495 | 0 | { |
496 | 0 | } |
497 | | |
498 | | CharsetDecoder* CharsetDecoder::createDefaultDecoder() |
499 | 0 | { |
500 | 0 | #if LOG4CXX_CHARSET_UTF8 |
501 | 0 | return new UTF8CharsetDecoder(); |
502 | | #elif LOG4CXX_CHARSET_ISO88591 || defined(_WIN32_WCE) |
503 | | return new ISOLatinCharsetDecoder(); |
504 | | #elif LOG4CXX_CHARSET_USASCII |
505 | | return new USASCIICharsetDecoder(); |
506 | | #elif LOG4CXX_LOGCHAR_IS_WCHAR && LOG4CXX_HAS_MBSRTOWCS |
507 | | return new MbstowcsCharsetDecoder(); |
508 | | #else |
509 | | return new LocaleCharsetDecoder(); |
510 | | #endif |
511 | 0 | } |
512 | | |
513 | | CharsetDecoderPtr CharsetDecoder::getDefaultDecoder() |
514 | 0 | { |
515 | 0 | static WideLife<CharsetDecoderPtr> decoder(createDefaultDecoder()); |
516 | | |
517 | | // |
518 | | // if invoked after static variable destruction |
519 | | // (if logging is called in the destructor of a static object) |
520 | | // then create a new decoder. |
521 | | // |
522 | 0 | if (decoder.value() == 0) |
523 | 0 | { |
524 | 0 | return CharsetDecoderPtr( createDefaultDecoder() ); |
525 | 0 | } |
526 | | |
527 | 0 | return decoder; |
528 | 0 | } |
529 | | |
530 | | CharsetDecoderPtr CharsetDecoder::getUTF8Decoder() |
531 | 0 | { |
532 | 0 | static WideLife<CharsetDecoderPtr> decoder(new UTF8CharsetDecoder()); |
533 | | |
534 | | // |
535 | | // if invoked after static variable destruction |
536 | | // (if logging is called in the destructor of a static object) |
537 | | // then create a new decoder. |
538 | | // |
539 | 0 | if (decoder.value() == 0) |
540 | 0 | { |
541 | 0 | return std::make_shared<UTF8CharsetDecoder>(); |
542 | 0 | } |
543 | | |
544 | 0 | return decoder; |
545 | 0 | } |
546 | | |
547 | | CharsetDecoderPtr CharsetDecoder::getISOLatinDecoder() |
548 | 0 | { |
549 | 0 | return std::make_shared<ISOLatinCharsetDecoder>(); |
550 | 0 | } |
551 | | |
552 | | |
553 | | CharsetDecoderPtr CharsetDecoder::getDecoder(const LogString& charset) |
554 | 0 | { |
555 | 0 | if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-8"), LOG4CXX_STR("utf-8")) || |
556 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF8"), LOG4CXX_STR("utf8")) || |
557 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP65001"), LOG4CXX_STR("cp65001"))) |
558 | 0 | { |
559 | 0 | return std::make_shared<UTF8CharsetDecoder>(); |
560 | 0 | } |
561 | 0 | else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("C"), LOG4CXX_STR("c")) || |
562 | 0 | charset == LOG4CXX_STR("646") || |
563 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("US-ASCII"), LOG4CXX_STR("us-ascii")) || |
564 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO646-US"), LOG4CXX_STR("iso646-US")) || |
565 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ANSI_X3.4-1968"), LOG4CXX_STR("ansi_x3.4-1968")) || |
566 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP20127"), LOG4CXX_STR("cp20127"))) |
567 | 0 | { |
568 | 0 | return std::make_shared<USASCIICharsetDecoder>(); |
569 | 0 | } |
570 | 0 | else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-8859-1"), LOG4CXX_STR("iso-8859-1")) || |
571 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-LATIN-1"), LOG4CXX_STR("iso-latin-1")) || |
572 | 0 | StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP1252"), LOG4CXX_STR("cp1252"))) |
573 | 0 | { |
574 | 0 | return std::make_shared<ISOLatinCharsetDecoder>(); |
575 | 0 | } |
576 | 0 | else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("LOCALE"), LOG4CXX_STR("locale"))) |
577 | 0 | { |
578 | 0 | return std::make_shared<LocaleCharsetDecoder>(); |
579 | 0 | } |
580 | | |
581 | 0 | #if APR_HAS_XLATE |
582 | 0 | return std::make_shared<APRCharsetDecoder>(charset); |
583 | | #else |
584 | | throw IllegalArgumentException(charset); |
585 | | #endif |
586 | 0 | } |
587 | | |
588 | | |
589 | | |
590 | | |
591 | | |
592 | | |