Coverage Report

Created: 2025-06-13 06:18

/src/gdal/port/cpl_recode_iconv.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 *
3
 * Name:     cpl_recode_iconv.cpp
4
 * Project:  CPL - Common Portability Library
5
 * Purpose:  Character set recoding and char/wchar_t conversions implemented
6
 *           using the iconv() functionality.
7
 * Author:   Andrey Kiselev, dron@ak4719.spb.edu
8
 *
9
 **********************************************************************
10
 * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
11
 * Copyright (c) 2011-2012, Even Rouault <even dot rouault at spatialys.com>
12
 *
13
 * Permission to use, copy, modify, and distribute this software for any
14
 * purpose with or without fee is hereby granted, provided that the above
15
 * copyright notice and this permission notice appear in all copies.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24
 **********************************************************************/
25
26
#include "cpl_port.h"
27
28
#include <algorithm>
29
30
#ifdef CPL_RECODE_ICONV
31
32
#include <iconv.h>
33
#include "cpl_string.h"
34
35
#ifndef ICONV_CPP_CONST
36
#define ICONV_CPP_CONST ICONV_CONST
37
#endif
38
39
constexpr size_t CPL_RECODE_DSTBUF_SIZE = 32768;
40
41
/* used by cpl_recode.cpp */
42
extern void CPLClearRecodeIconvWarningFlags();
43
extern char *CPLRecodeIconv(const char *, const char *,
44
                            const char *) CPL_RETURNS_NONNULL;
45
extern char *CPLRecodeFromWCharIconv(const wchar_t *, const char *,
46
                                     const char *);
47
extern wchar_t *CPLRecodeToWCharIconv(const char *, const char *, const char *);
48
49
/************************************************************************/
50
/*                 CPLClearRecodeIconvWarningFlags()                    */
51
/************************************************************************/
52
53
// One-shot guards for the "characters couldn't be converted" warnings:
// bHaveWarned1 is set by CPLRecodeIconv() and bHaveWarned2 by
// CPLRecodeFromWCharIconv() the first time each emits its warning.
static bool bHaveWarned1 = false;
static bool bHaveWarned2 = false;

/**
 * Reset both one-shot warning guards so that the next conversion problem
 * is reported again.
 */
void CPLClearRecodeIconvWarningFlags()
{
    bHaveWarned1 = bHaveWarned2 = false;
}
61
62
/************************************************************************/
63
/*                      CPLFixInputEncoding()                           */
64
/************************************************************************/
65
66
static const char *CPLFixInputEncoding(const char *pszSrcEncoding,
67
                                       int nFirstVal)
68
0
{
69
0
#if CPL_IS_LSB
70
    // iconv on Alpine Linux seems to assume BE order, when it is not explicit
71
0
    if (EQUAL(pszSrcEncoding, CPL_ENC_UCS2))
72
0
        pszSrcEncoding = "UCS-2LE";
73
0
    else if (EQUAL(pszSrcEncoding, CPL_ENC_UTF16) && nFirstVal != 0xFF &&
74
0
             nFirstVal != 0xFE && nFirstVal != 0xFFFE && nFirstVal != 0xFEFF)
75
0
    {
76
        // Only force UTF-16LE if there's no starting endianness marker
77
0
        pszSrcEncoding = "UTF-16LE";
78
0
    }
79
#else
80
    CPL_IGNORE_RET_VAL(nFirstVal);
81
#endif
82
0
    return pszSrcEncoding;
83
0
}
84
85
/************************************************************************/
86
/*                          CPLRecodeIconv()                            */
87
/************************************************************************/
88
89
/**
90
 * Convert a string from a source encoding to a destination encoding
91
 * using the iconv() function.
92
 *
93
 * If an error occurs an error may, or may not be posted with CPLError().
94
 *
95
 * @param pszSource a NULL terminated string.
96
 * @param pszSrcEncoding the source encoding.
97
 * @param pszDstEncoding the destination encoding.
98
 *
99
 * @return a NULL terminated string which should be freed with CPLFree().
100
 */
101
102
char *CPLRecodeIconv(const char *pszSource, const char *pszSrcEncoding,
103
                     const char *pszDstEncoding)
104
105
0
{
106
0
    pszSrcEncoding = CPLFixInputEncoding(
107
0
        pszSrcEncoding, static_cast<unsigned char>(pszSource[0]));
108
109
0
    iconv_t sConv;
110
111
0
    sConv = iconv_open(pszDstEncoding, pszSrcEncoding);
112
113
0
#ifdef __GNUC__
114
0
#pragma GCC diagnostic push
115
0
#pragma GCC diagnostic ignored "-Wold-style-cast"
116
0
#endif
117
    // iconv_t might be a integer or a pointer, so we have to fallback to
118
    // C-style cast
119
0
    if (sConv == (iconv_t)(-1))
120
0
#ifdef __GNUC__
121
0
#pragma GCC diagnostic pop
122
0
#endif
123
0
    {
124
0
        CPLError(CE_Warning, CPLE_AppDefined,
125
0
                 "Recode from %s to %s failed with the error: \"%s\".",
126
0
                 pszSrcEncoding, pszDstEncoding, strerror(errno));
127
128
0
        return CPLStrdup(pszSource);
129
0
    }
130
131
    /* -------------------------------------------------------------------- */
132
    /*      XXX: There is a portability issue: iconv() function could be    */
133
    /*      declared differently on different platforms. The second         */
134
    /*      argument could be declared as char** (as POSIX defines) or      */
135
    /*      as a const char**. Handle it with the ICONV_CPP_CONST macro here. */
136
    /* -------------------------------------------------------------------- */
137
0
    ICONV_CPP_CONST char *pszSrcBuf =
138
0
        const_cast<ICONV_CPP_CONST char *>(pszSource);
139
0
    size_t nSrcLen = strlen(pszSource);
140
0
    size_t nDstCurLen = std::max(CPL_RECODE_DSTBUF_SIZE, nSrcLen);
141
0
    size_t nDstLen = nDstCurLen;
142
0
    char *pszDestination =
143
0
        static_cast<char *>(CPLCalloc(nDstCurLen + 1, sizeof(char)));
144
0
    char *pszDstBuf = pszDestination;
145
146
0
    while (nSrcLen > 0)
147
0
    {
148
0
        size_t nConverted =
149
0
            iconv(sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen);
150
151
0
        if (nConverted == static_cast<size_t>(-1))
152
0
        {
153
0
            if (errno == EILSEQ)
154
0
            {
155
                // Skip the invalid sequence in the input string.
156
0
                if (!bHaveWarned1)
157
0
                {
158
0
                    bHaveWarned1 = true;
159
0
                    CPLError(CE_Warning, CPLE_AppDefined,
160
0
                             "One or several characters couldn't be converted "
161
0
                             "correctly from %s to %s.  "
162
0
                             "This warning will not be emitted anymore",
163
0
                             pszSrcEncoding, pszDstEncoding);
164
0
                }
165
0
                if (nSrcLen == 0)
166
0
                    break;
167
0
                nSrcLen--;
168
0
                pszSrcBuf++;
169
0
                continue;
170
0
            }
171
172
0
            else if (errno == E2BIG)
173
0
            {
174
                // We are running out of the output buffer.
175
                // Dynamically increase the buffer size.
176
0
                size_t nTmp = nDstCurLen;
177
0
                nDstCurLen *= 2;
178
0
                pszDestination = static_cast<char *>(
179
0
                    CPLRealloc(pszDestination, nDstCurLen + 1));
180
0
                pszDstBuf = pszDestination + nTmp - nDstLen;
181
0
                nDstLen += nTmp;
182
0
                continue;
183
0
            }
184
185
0
            else
186
0
                break;
187
0
        }
188
0
    }
189
190
0
    pszDestination[nDstCurLen - nDstLen] = '\0';
191
192
0
    iconv_close(sConv);
193
194
0
    return pszDestination;
195
0
}
196
197
/************************************************************************/
198
/*                      CPLRecodeFromWCharIconv()                       */
199
/************************************************************************/
200
201
/**
202
 * Convert wchar_t string to UTF-8.
203
 *
204
 * Convert a wchar_t string into a multibyte utf-8 string
205
 * using the iconv() function.
206
 *
207
 * Note that the wchar_t type varies in size on different systems. On
208
 * win32 it is normally 2 bytes, and on unix 4 bytes.
209
 *
210
 * If an error occurs an error may, or may not be posted with CPLError().
211
 *
212
 * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
213
 * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
214
 * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
215
 *
216
 * @return a zero terminated multi-byte string which should be freed with
217
 * CPLFree(), or NULL if an error occurs.
218
 */
219
220
char *CPLRecodeFromWCharIconv(const wchar_t *pwszSource,
221
                              const char *pszSrcEncoding,
222
                              const char *pszDstEncoding)
223
224
0
{
225
0
    pszSrcEncoding = CPLFixInputEncoding(pszSrcEncoding, pwszSource[0]);
226
227
    /* -------------------------------------------------------------------- */
228
    /*      What is the source length.                                      */
229
    /* -------------------------------------------------------------------- */
230
0
    size_t nSrcLen = 0;
231
232
0
    while (pwszSource[nSrcLen] != 0)
233
0
        nSrcLen++;
234
235
    /* -------------------------------------------------------------------- */
236
    /*      iconv() does not support wchar_t so we need to repack the       */
237
    /*      characters according to the width of a character in the         */
238
    /*      source encoding.  For instance if wchar_t is 4 bytes but our    */
239
    /*      source is UTF16 then we need to pack down into 2 byte           */
240
    /*      characters before passing to iconv().                           */
241
    /* -------------------------------------------------------------------- */
242
0
    const int nTargetCharWidth = CPLEncodingCharSize(pszSrcEncoding);
243
244
0
    if (nTargetCharWidth < 1)
245
0
    {
246
0
        CPLError(CE_Warning, CPLE_AppDefined,
247
0
                 "Recode from %s with CPLRecodeFromWChar() failed because"
248
0
                 " the width of characters in the encoding are not known.",
249
0
                 pszSrcEncoding);
250
0
        return CPLStrdup("");
251
0
    }
252
253
0
    GByte *pszIconvSrcBuf =
254
0
        static_cast<GByte *>(CPLCalloc((nSrcLen + 1), nTargetCharWidth));
255
256
0
    for (unsigned int iSrc = 0; iSrc <= nSrcLen; iSrc++)
257
0
    {
258
0
        if (nTargetCharWidth == 1)
259
0
            pszIconvSrcBuf[iSrc] = static_cast<GByte>(pwszSource[iSrc]);
260
0
        else if (nTargetCharWidth == 2)
261
0
            (reinterpret_cast<short *>(pszIconvSrcBuf))[iSrc] =
262
0
                static_cast<short>(pwszSource[iSrc]);
263
0
        else if (nTargetCharWidth == 4)
264
0
            (reinterpret_cast<GInt32 *>(pszIconvSrcBuf))[iSrc] =
265
0
                pwszSource[iSrc];
266
0
    }
267
268
    /* -------------------------------------------------------------------- */
269
    /*      Create the iconv() translation object.                          */
270
    /* -------------------------------------------------------------------- */
271
0
    iconv_t sConv;
272
273
0
    sConv = iconv_open(pszDstEncoding, pszSrcEncoding);
274
275
0
#ifdef __GNUC__
276
0
#pragma GCC diagnostic push
277
0
#pragma GCC diagnostic ignored "-Wold-style-cast"
278
0
#endif
279
    // iconv_t might be a integer or a pointer, so we have to fallback to
280
    // C-style cast
281
0
    if (sConv == (iconv_t)(-1))
282
0
#ifdef __GNUC__
283
0
#pragma GCC diagnostic pop
284
0
#endif
285
0
    {
286
0
        CPLFree(pszIconvSrcBuf);
287
0
        CPLError(CE_Warning, CPLE_AppDefined,
288
0
                 "Recode from %s to %s failed with the error: \"%s\".",
289
0
                 pszSrcEncoding, pszDstEncoding, strerror(errno));
290
291
0
        return CPLStrdup("");
292
0
    }
293
294
    /* -------------------------------------------------------------------- */
295
    /*      XXX: There is a portability issue: iconv() function could be    */
296
    /*      declared differently on different platforms. The second         */
297
    /*      argument could be declared as char** (as POSIX defines) or      */
298
    /*      as a const char**. Handle it with the ICONV_CPP_CONST macro here. */
299
    /* -------------------------------------------------------------------- */
300
0
    ICONV_CPP_CONST char *pszSrcBuf = const_cast<ICONV_CPP_CONST char *>(
301
0
        reinterpret_cast<char *>(pszIconvSrcBuf));
302
303
    /* iconv expects a number of bytes, not characters */
304
0
    nSrcLen *= nTargetCharWidth;
305
306
    /* -------------------------------------------------------------------- */
307
    /*      Allocate destination buffer.                                    */
308
    /* -------------------------------------------------------------------- */
309
0
    size_t nDstCurLen = std::max(CPL_RECODE_DSTBUF_SIZE, nSrcLen + 1);
310
0
    size_t nDstLen = nDstCurLen;
311
0
    char *pszDestination =
312
0
        static_cast<char *>(CPLCalloc(nDstCurLen, sizeof(char)));
313
0
    char *pszDstBuf = pszDestination;
314
315
0
    while (nSrcLen > 0)
316
0
    {
317
0
        const size_t nConverted =
318
0
            iconv(sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen);
319
320
0
        if (nConverted == static_cast<size_t>(-1))
321
0
        {
322
0
            if (errno == EILSEQ)
323
0
            {
324
                // Skip the invalid sequence in the input string.
325
0
                nSrcLen -= nTargetCharWidth;
326
0
                pszSrcBuf += nTargetCharWidth;
327
0
                if (!bHaveWarned2)
328
0
                {
329
0
                    bHaveWarned2 = true;
330
0
                    CPLError(CE_Warning, CPLE_AppDefined,
331
0
                             "One or several characters couldn't be converted "
332
0
                             "correctly from %s to %s.  "
333
0
                             "This warning will not be emitted anymore",
334
0
                             pszSrcEncoding, pszDstEncoding);
335
0
                }
336
0
                continue;
337
0
            }
338
339
0
            else if (errno == E2BIG)
340
0
            {
341
                // We are running out of the output buffer.
342
                // Dynamically increase the buffer size.
343
0
                size_t nTmp = nDstCurLen;
344
0
                nDstCurLen *= 2;
345
0
                pszDestination =
346
0
                    static_cast<char *>(CPLRealloc(pszDestination, nDstCurLen));
347
0
                pszDstBuf = pszDestination + nTmp - nDstLen;
348
0
                nDstLen += nDstCurLen - nTmp;
349
0
                continue;
350
0
            }
351
352
0
            else
353
0
                break;
354
0
        }
355
0
    }
356
357
0
    if (nDstLen == 0)
358
0
    {
359
0
        ++nDstCurLen;
360
0
        pszDestination =
361
0
            static_cast<char *>(CPLRealloc(pszDestination, nDstCurLen));
362
0
        ++nDstLen;
363
0
    }
364
0
    pszDestination[nDstCurLen - nDstLen] = '\0';
365
366
0
    iconv_close(sConv);
367
368
0
    CPLFree(pszIconvSrcBuf);
369
370
0
    return pszDestination;
371
0
}
372
373
/************************************************************************/
374
/*                        CPLRecodeToWCharIconv()                       */
375
/************************************************************************/
376
377
/**
378
 * Convert UTF-8 string to a wchar_t string.
379
 *
380
 * Convert a 8bit, multi-byte per character input string into a wide
381
 * character (wchar_t) string using the iconv() function.
382
 *
383
 * Note that the wchar_t type varies in size on different systems. On
384
 * win32 it is normally 2 bytes, and on unix 4 bytes.
385
 *
386
 * If an error occurs an error may, or may not be posted with CPLError().
387
 *
388
 * @param pszSource input multi-byte character string.
389
 * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
390
 * @param pszDstEncoding destination encoding. Must be "WCHAR_T".
391
 *
392
 * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
393
 * NULL on error.
394
 */
395
396
wchar_t *CPLRecodeToWCharIconv(const char *pszSource,
397
                               const char *pszSrcEncoding,
398
                               const char *pszDstEncoding)
399
400
0
{
401
0
    if (strcmp(pszDstEncoding, "WCHAR_T") != 0)
402
0
    {
403
0
        CPLError(CE_Failure, CPLE_AppDefined,
404
0
                 "Stub recoding implementation does not support "
405
0
                 "CPLRecodeToWCharIconv(...,%s,%s)",
406
0
                 pszSrcEncoding, pszDstEncoding);
407
0
        return nullptr;
408
0
    }
409
410
    // Using double static_cast<> makes CodeQL cpp/incorrect-string-type-conversion
411
    // check happy...
412
0
    return static_cast<wchar_t *>(static_cast<void *>(
413
0
        CPLRecodeIconv(pszSource, pszSrcEncoding, pszDstEncoding)));
414
0
}
415
416
#endif /* CPL_RECODE_ICONV */