/src/gdal/port/cpl_recode_iconv.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * |
3 | | * Name: cpl_recode_iconv.cpp |
4 | | * Project: CPL - Common Portability Library |
5 | | * Purpose: Character set recoding and char/wchar_t conversions implemented |
6 | | * using the iconv() functionality. |
7 | | * Author: Andrey Kiselev, dron@ak4719.spb.edu |
8 | | * |
9 | | ********************************************************************** |
10 | | * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu> |
11 | | * Copyright (c) 2011-2012, Even Rouault <even dot rouault at spatialys.com> |
12 | | * |
13 | | * Permission to use, copy, modify, and distribute this software for any |
14 | | * purpose with or without fee is hereby granted, provided that the above |
15 | | * copyright notice and this permission notice appear in all copies. |
16 | | * |
17 | | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
18 | | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
19 | | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
20 | | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
21 | | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
22 | | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
23 | | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
24 | | **********************************************************************/ |
25 | | |
26 | | #include "cpl_port.h" |
27 | | |
28 | | #include <algorithm> |
29 | | |
30 | | #ifdef CPL_RECODE_ICONV |
31 | | |
32 | | #include <iconv.h> |
33 | | #include "cpl_string.h" |
34 | | |
35 | | #ifndef ICONV_CPP_CONST |
36 | | #define ICONV_CPP_CONST ICONV_CONST |
37 | | #endif |
38 | | |
// Minimum size, in bytes, of the output buffer initially handed to iconv().
// The recoding routines in this file start from
// max(CPL_RECODE_DSTBUF_SIZE, source length) and double the buffer each time
// iconv() reports E2BIG.
39 | | constexpr size_t CPL_RECODE_DSTBUF_SIZE = 32768;
40 | |
41 | | /* used by cpl_recode.cpp */
// The prototypes below are declared here, ahead of the definitions that
// follow in this file.
42 | | extern void CPLClearRecodeIconvWarningFlags();
43 | | extern char *CPLRecodeIconv(const char *, const char *,
44 | | const char *) CPL_RETURNS_NONNULL;
45 | | extern char *CPLRecodeFromWCharIconv(const wchar_t *, const char *,
46 | | const char *);
47 | | extern wchar_t *CPLRecodeToWCharIconv(const char *, const char *, const char *);
48 | | |
49 | | /************************************************************************/ |
50 | | /* CPLClearRecodeIconvWarningFlags() */ |
51 | | /************************************************************************/ |
52 | | |
// One-shot latches. Each one is raised the first time the matching routine
// warns about a character that could not be converted, so that the warning
// is reported only once until the latches are cleared again.
static bool bHaveWarned1 = false;  // latched by CPLRecodeIconv()
static bool bHaveWarned2 = false;  // latched by CPLRecodeFromWCharIconv()

/**
 * Clear both "already warned" latches, so that the next conversion failure
 * in CPLRecodeIconv() or CPLRecodeFromWCharIconv() emits its warning again.
 */
void CPLClearRecodeIconvWarningFlags()
{
    bHaveWarned1 = bHaveWarned2 = false;
}
61 | | |
62 | | /************************************************************************/ |
63 | | /* CPLFixInputEncoding() */ |
64 | | /************************************************************************/ |
65 | | |
66 | | static const char *CPLFixInputEncoding(const char *pszSrcEncoding, |
67 | | int nFirstVal) |
68 | 0 | { |
69 | 0 | #if CPL_IS_LSB |
70 | | // iconv on Alpine Linux seems to assume BE order, when it is not explicit |
71 | 0 | if (EQUAL(pszSrcEncoding, CPL_ENC_UCS2)) |
72 | 0 | pszSrcEncoding = "UCS-2LE"; |
73 | 0 | else if (EQUAL(pszSrcEncoding, CPL_ENC_UTF16) && nFirstVal != 0xFF && |
74 | 0 | nFirstVal != 0xFE && nFirstVal != 0xFFFE && nFirstVal != 0xFEFF) |
75 | 0 | { |
76 | | // Only force UTF-16LE if there's no starting endianness marker |
77 | 0 | pszSrcEncoding = "UTF-16LE"; |
78 | 0 | } |
79 | | #else |
80 | | CPL_IGNORE_RET_VAL(nFirstVal); |
81 | | #endif |
82 | 0 | return pszSrcEncoding; |
83 | 0 | } |
84 | | |
85 | | /************************************************************************/ |
86 | | /* CPLRecodeIconv() */ |
87 | | /************************************************************************/ |
88 | | |
89 | | /** |
90 | | * Convert a string from a source encoding to a destination encoding |
91 | | * using the iconv() function. |
92 | | * |
93 | | * If an error occurs an error may, or may not be posted with CPLError(). |
94 | | * |
95 | | * @param pszSource a NULL terminated string. |
96 | | * @param pszSrcEncoding the source encoding. |
97 | | * @param pszDstEncoding the destination encoding. |
98 | | * |
99 | | * @return a NULL terminated string which should be freed with CPLFree(). |
100 | | */ |
101 | | |
102 | | char *CPLRecodeIconv(const char *pszSource, const char *pszSrcEncoding, |
103 | | const char *pszDstEncoding) |
104 | | |
105 | 0 | { |
106 | 0 | pszSrcEncoding = CPLFixInputEncoding( |
107 | 0 | pszSrcEncoding, static_cast<unsigned char>(pszSource[0])); |
108 | |
|
109 | 0 | iconv_t sConv; |
110 | |
|
111 | 0 | sConv = iconv_open(pszDstEncoding, pszSrcEncoding); |
112 | |
|
113 | 0 | #ifdef __GNUC__ |
114 | 0 | #pragma GCC diagnostic push |
115 | 0 | #pragma GCC diagnostic ignored "-Wold-style-cast" |
116 | 0 | #endif |
117 | | // iconv_t might be a integer or a pointer, so we have to fallback to |
118 | | // C-style cast |
119 | 0 | if (sConv == (iconv_t)(-1)) |
120 | 0 | #ifdef __GNUC__ |
121 | 0 | #pragma GCC diagnostic pop |
122 | 0 | #endif |
123 | 0 | { |
124 | 0 | CPLError(CE_Warning, CPLE_AppDefined, |
125 | 0 | "Recode from %s to %s failed with the error: \"%s\".", |
126 | 0 | pszSrcEncoding, pszDstEncoding, strerror(errno)); |
127 | |
|
128 | 0 | return CPLStrdup(pszSource); |
129 | 0 | } |
130 | | |
131 | | /* -------------------------------------------------------------------- */ |
132 | | /* XXX: There is a portability issue: iconv() function could be */ |
133 | | /* declared differently on different platforms. The second */ |
134 | | /* argument could be declared as char** (as POSIX defines) or */ |
135 | | /* as a const char**. Handle it with the ICONV_CPP_CONST macro here. */ |
136 | | /* -------------------------------------------------------------------- */ |
137 | 0 | ICONV_CPP_CONST char *pszSrcBuf = |
138 | 0 | const_cast<ICONV_CPP_CONST char *>(pszSource); |
139 | 0 | size_t nSrcLen = strlen(pszSource); |
140 | 0 | size_t nDstCurLen = std::max(CPL_RECODE_DSTBUF_SIZE, nSrcLen); |
141 | 0 | size_t nDstLen = nDstCurLen; |
142 | 0 | char *pszDestination = |
143 | 0 | static_cast<char *>(CPLCalloc(nDstCurLen + 1, sizeof(char))); |
144 | 0 | char *pszDstBuf = pszDestination; |
145 | |
|
146 | 0 | while (nSrcLen > 0) |
147 | 0 | { |
148 | 0 | size_t nConverted = |
149 | 0 | iconv(sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen); |
150 | |
|
151 | 0 | if (nConverted == static_cast<size_t>(-1)) |
152 | 0 | { |
153 | 0 | if (errno == EILSEQ) |
154 | 0 | { |
155 | | // Skip the invalid sequence in the input string. |
156 | 0 | if (!bHaveWarned1) |
157 | 0 | { |
158 | 0 | bHaveWarned1 = true; |
159 | 0 | CPLError(CE_Warning, CPLE_AppDefined, |
160 | 0 | "One or several characters couldn't be converted " |
161 | 0 | "correctly from %s to %s. " |
162 | 0 | "This warning will not be emitted anymore", |
163 | 0 | pszSrcEncoding, pszDstEncoding); |
164 | 0 | } |
165 | 0 | if (nSrcLen == 0) |
166 | 0 | break; |
167 | 0 | nSrcLen--; |
168 | 0 | pszSrcBuf++; |
169 | 0 | continue; |
170 | 0 | } |
171 | | |
172 | 0 | else if (errno == E2BIG) |
173 | 0 | { |
174 | | // We are running out of the output buffer. |
175 | | // Dynamically increase the buffer size. |
176 | 0 | size_t nTmp = nDstCurLen; |
177 | 0 | nDstCurLen *= 2; |
178 | 0 | pszDestination = static_cast<char *>( |
179 | 0 | CPLRealloc(pszDestination, nDstCurLen + 1)); |
180 | 0 | pszDstBuf = pszDestination + nTmp - nDstLen; |
181 | 0 | nDstLen += nTmp; |
182 | 0 | continue; |
183 | 0 | } |
184 | | |
185 | 0 | else |
186 | 0 | break; |
187 | 0 | } |
188 | 0 | } |
189 | |
|
190 | 0 | pszDestination[nDstCurLen - nDstLen] = '\0'; |
191 | |
|
192 | 0 | iconv_close(sConv); |
193 | |
|
194 | 0 | return pszDestination; |
195 | 0 | } |
196 | | |
197 | | /************************************************************************/ |
198 | | /* CPLRecodeFromWCharIconv() */ |
199 | | /************************************************************************/ |
200 | | |
201 | | /** |
202 | | * Convert wchar_t string to UTF-8. |
203 | | * |
204 | | * Convert a wchar_t string into a multibyte utf-8 string |
205 | | * using the iconv() function. |
206 | | * |
207 | | * Note that the wchar_t type varies in size on different systems. On |
208 | | * win32 it is normally 2 bytes, and on unix 4 bytes. |
209 | | * |
210 | | * If an error occurs an error may, or may not be posted with CPLError(). |
211 | | * |
212 | | * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t. |
213 | | * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2. |
214 | | * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8. |
215 | | * |
216 | | * @return a zero terminated multi-byte string which should be freed with |
217 | | * CPLFree(), or NULL if an error occurs. |
218 | | */ |
219 | | |
220 | | char *CPLRecodeFromWCharIconv(const wchar_t *pwszSource, |
221 | | const char *pszSrcEncoding, |
222 | | const char *pszDstEncoding) |
223 | | |
224 | 0 | { |
225 | 0 | pszSrcEncoding = CPLFixInputEncoding(pszSrcEncoding, pwszSource[0]); |
226 | | |
227 | | /* -------------------------------------------------------------------- */ |
228 | | /* What is the source length. */ |
229 | | /* -------------------------------------------------------------------- */ |
230 | 0 | size_t nSrcLen = 0; |
231 | |
|
232 | 0 | while (pwszSource[nSrcLen] != 0) |
233 | 0 | nSrcLen++; |
234 | | |
235 | | /* -------------------------------------------------------------------- */ |
236 | | /* iconv() does not support wchar_t so we need to repack the */ |
237 | | /* characters according to the width of a character in the */ |
238 | | /* source encoding. For instance if wchar_t is 4 bytes but our */ |
239 | | /* source is UTF16 then we need to pack down into 2 byte */ |
240 | | /* characters before passing to iconv(). */ |
241 | | /* -------------------------------------------------------------------- */ |
242 | 0 | const int nTargetCharWidth = CPLEncodingCharSize(pszSrcEncoding); |
243 | |
|
244 | 0 | if (nTargetCharWidth < 1) |
245 | 0 | { |
246 | 0 | CPLError(CE_Warning, CPLE_AppDefined, |
247 | 0 | "Recode from %s with CPLRecodeFromWChar() failed because" |
248 | 0 | " the width of characters in the encoding are not known.", |
249 | 0 | pszSrcEncoding); |
250 | 0 | return CPLStrdup(""); |
251 | 0 | } |
252 | | |
253 | 0 | GByte *pszIconvSrcBuf = |
254 | 0 | static_cast<GByte *>(CPLCalloc((nSrcLen + 1), nTargetCharWidth)); |
255 | |
|
256 | 0 | for (unsigned int iSrc = 0; iSrc <= nSrcLen; iSrc++) |
257 | 0 | { |
258 | 0 | if (nTargetCharWidth == 1) |
259 | 0 | pszIconvSrcBuf[iSrc] = static_cast<GByte>(pwszSource[iSrc]); |
260 | 0 | else if (nTargetCharWidth == 2) |
261 | 0 | (reinterpret_cast<short *>(pszIconvSrcBuf))[iSrc] = |
262 | 0 | static_cast<short>(pwszSource[iSrc]); |
263 | 0 | else if (nTargetCharWidth == 4) |
264 | 0 | (reinterpret_cast<GInt32 *>(pszIconvSrcBuf))[iSrc] = |
265 | 0 | pwszSource[iSrc]; |
266 | 0 | } |
267 | | |
268 | | /* -------------------------------------------------------------------- */ |
269 | | /* Create the iconv() translation object. */ |
270 | | /* -------------------------------------------------------------------- */ |
271 | 0 | iconv_t sConv; |
272 | |
|
273 | 0 | sConv = iconv_open(pszDstEncoding, pszSrcEncoding); |
274 | |
|
275 | 0 | #ifdef __GNUC__ |
276 | 0 | #pragma GCC diagnostic push |
277 | 0 | #pragma GCC diagnostic ignored "-Wold-style-cast" |
278 | 0 | #endif |
279 | | // iconv_t might be a integer or a pointer, so we have to fallback to |
280 | | // C-style cast |
281 | 0 | if (sConv == (iconv_t)(-1)) |
282 | 0 | #ifdef __GNUC__ |
283 | 0 | #pragma GCC diagnostic pop |
284 | 0 | #endif |
285 | 0 | { |
286 | 0 | CPLFree(pszIconvSrcBuf); |
287 | 0 | CPLError(CE_Warning, CPLE_AppDefined, |
288 | 0 | "Recode from %s to %s failed with the error: \"%s\".", |
289 | 0 | pszSrcEncoding, pszDstEncoding, strerror(errno)); |
290 | |
|
291 | 0 | return CPLStrdup(""); |
292 | 0 | } |
293 | | |
294 | | /* -------------------------------------------------------------------- */ |
295 | | /* XXX: There is a portability issue: iconv() function could be */ |
296 | | /* declared differently on different platforms. The second */ |
297 | | /* argument could be declared as char** (as POSIX defines) or */ |
298 | | /* as a const char**. Handle it with the ICONV_CPP_CONST macro here. */ |
299 | | /* -------------------------------------------------------------------- */ |
300 | 0 | ICONV_CPP_CONST char *pszSrcBuf = const_cast<ICONV_CPP_CONST char *>( |
301 | 0 | reinterpret_cast<char *>(pszIconvSrcBuf)); |
302 | | |
303 | | /* iconv expects a number of bytes, not characters */ |
304 | 0 | nSrcLen *= nTargetCharWidth; |
305 | | |
306 | | /* -------------------------------------------------------------------- */ |
307 | | /* Allocate destination buffer. */ |
308 | | /* -------------------------------------------------------------------- */ |
309 | 0 | size_t nDstCurLen = std::max(CPL_RECODE_DSTBUF_SIZE, nSrcLen + 1); |
310 | 0 | size_t nDstLen = nDstCurLen; |
311 | 0 | char *pszDestination = |
312 | 0 | static_cast<char *>(CPLCalloc(nDstCurLen, sizeof(char))); |
313 | 0 | char *pszDstBuf = pszDestination; |
314 | |
|
315 | 0 | while (nSrcLen > 0) |
316 | 0 | { |
317 | 0 | const size_t nConverted = |
318 | 0 | iconv(sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen); |
319 | |
|
320 | 0 | if (nConverted == static_cast<size_t>(-1)) |
321 | 0 | { |
322 | 0 | if (errno == EILSEQ) |
323 | 0 | { |
324 | | // Skip the invalid sequence in the input string. |
325 | 0 | nSrcLen -= nTargetCharWidth; |
326 | 0 | pszSrcBuf += nTargetCharWidth; |
327 | 0 | if (!bHaveWarned2) |
328 | 0 | { |
329 | 0 | bHaveWarned2 = true; |
330 | 0 | CPLError(CE_Warning, CPLE_AppDefined, |
331 | 0 | "One or several characters couldn't be converted " |
332 | 0 | "correctly from %s to %s. " |
333 | 0 | "This warning will not be emitted anymore", |
334 | 0 | pszSrcEncoding, pszDstEncoding); |
335 | 0 | } |
336 | 0 | continue; |
337 | 0 | } |
338 | | |
339 | 0 | else if (errno == E2BIG) |
340 | 0 | { |
341 | | // We are running out of the output buffer. |
342 | | // Dynamically increase the buffer size. |
343 | 0 | size_t nTmp = nDstCurLen; |
344 | 0 | nDstCurLen *= 2; |
345 | 0 | pszDestination = |
346 | 0 | static_cast<char *>(CPLRealloc(pszDestination, nDstCurLen)); |
347 | 0 | pszDstBuf = pszDestination + nTmp - nDstLen; |
348 | 0 | nDstLen += nDstCurLen - nTmp; |
349 | 0 | continue; |
350 | 0 | } |
351 | | |
352 | 0 | else |
353 | 0 | break; |
354 | 0 | } |
355 | 0 | } |
356 | |
|
357 | 0 | if (nDstLen == 0) |
358 | 0 | { |
359 | 0 | ++nDstCurLen; |
360 | 0 | pszDestination = |
361 | 0 | static_cast<char *>(CPLRealloc(pszDestination, nDstCurLen)); |
362 | 0 | ++nDstLen; |
363 | 0 | } |
364 | 0 | pszDestination[nDstCurLen - nDstLen] = '\0'; |
365 | |
|
366 | 0 | iconv_close(sConv); |
367 | |
|
368 | 0 | CPLFree(pszIconvSrcBuf); |
369 | |
|
370 | 0 | return pszDestination; |
371 | 0 | } |
372 | | |
373 | | /************************************************************************/ |
374 | | /* CPLRecodeToWCharIconv() */ |
375 | | /************************************************************************/ |
376 | | |
377 | | /** |
378 | | * Convert UTF-8 string to a wchar_t string. |
379 | | * |
380 | | * Convert a 8bit, multi-byte per character input string into a wide |
381 | | * character (wchar_t) string using the iconv() function. |
382 | | * |
383 | | * Note that the wchar_t type varies in size on different systems. On |
384 | | * win32 it is normally 2 bytes, and on unix 4 bytes. |
385 | | * |
386 | | * If an error occurs an error may, or may not be posted with CPLError(). |
387 | | * |
388 | | * @param pszSource input multi-byte character string. |
389 | | * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8. |
390 | | * @param pszDstEncoding destination encoding. Must be "WCHAR_T". |
391 | | * |
392 | | * @return the zero terminated wchar_t string (to be freed with CPLFree()) or |
393 | | * NULL on error. |
394 | | */ |
395 | | |
396 | | wchar_t *CPLRecodeToWCharIconv(const char *pszSource, |
397 | | const char *pszSrcEncoding, |
398 | | const char *pszDstEncoding) |
399 | | |
400 | 0 | { |
401 | 0 | if (strcmp(pszDstEncoding, "WCHAR_T") != 0) |
402 | 0 | { |
403 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
404 | 0 | "Stub recoding implementation does not support " |
405 | 0 | "CPLRecodeToWCharIconv(...,%s,%s)", |
406 | 0 | pszSrcEncoding, pszDstEncoding); |
407 | 0 | return nullptr; |
408 | 0 | } |
409 | | |
410 | | // Using double static_cast<> makes CodeQL cpp/incorrect-string-type-conversion |
411 | | // check happy... |
412 | 0 | return static_cast<wchar_t *>(static_cast<void *>( |
413 | 0 | CPLRecodeIconv(pszSource, pszSrcEncoding, pszDstEncoding))); |
414 | 0 | } |
415 | | |
416 | | #endif /* CPL_RECODE_ICONV */ |