Coverage Report

Created: 2025-06-09 07:43

/src/gdal/port/cpl_recode.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 *
3
 * Name:     cpl_recode.cpp
4
 * Project:  CPL - Common Portability Library
5
 * Purpose:  Character set recoding and char/wchar_t conversions.
6
 * Author:   Andrey Kiselev, dron@ak4719.spb.edu
7
 *
8
 **********************************************************************
9
 * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
10
 * Copyright (c) 2008, Frank Warmerdam
11
 * Copyright (c) 2011-2014, Even Rouault <even dot rouault at spatialys.com>
12
 *
13
 * Permission to use, copy, modify, and distribute this software for any
14
 * purpose with or without fee is hereby granted, provided that the above
15
 * copyright notice and this permission notice appear in all copies.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24
 **********************************************************************/
25
26
#include "cpl_port.h"
27
#include "cpl_string.h"
28
29
#include <cstring>
30
31
#include "cpl_conv.h"
32
#include "cpl_character_sets.h"
33
34
#include "utf8.h"
35
36
#ifdef CPL_RECODE_ICONV
37
extern void CPLClearRecodeIconvWarningFlags();
38
extern char *CPLRecodeIconv(const char *, const char *,
39
                            const char *) CPL_RETURNS_NONNULL;
40
extern char *CPLRecodeFromWCharIconv(const wchar_t *, const char *,
41
                                     const char *);
42
extern wchar_t *CPLRecodeToWCharIconv(const char *, const char *, const char *);
43
#endif  // CPL_RECODE_ICONV
44
45
extern void CPLClearRecodeStubWarningFlags();
46
extern char *CPLRecodeStub(const char *, const char *,
47
                           const char *) CPL_RETURNS_NONNULL;
48
extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
49
                                    const char *);
50
extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
51
extern int CPLIsUTF8Stub(const char *, int);
52
53
/************************************************************************/
54
/*                             CPLRecode()                              */
55
/************************************************************************/
56
57
/**
58
 * Convert a string from a source encoding to a destination encoding.
59
 *
60
 * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
61
 * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
62
 * <ul>
63
 *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
64
 *  fact)</li>
65
 *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
66
 *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
67
 * </ul>
68
 *
69
 * If an error occurs an error may, or may not be posted with CPLError().
70
 *
71
 * @param pszSource a NULL terminated string.
72
 * @param pszSrcEncoding the source encoding.
73
 * @param pszDstEncoding the destination encoding.
74
 *
75
 * @return a NULL terminated string which should be freed with CPLFree().
76
 *
77
 * @since GDAL 1.6.0
78
 */
79
80
char CPL_DLL *CPLRecode(const char *pszSource, const char *pszSrcEncoding,
81
                        const char *pszDstEncoding)
82
83
8.81k
{
84
    /* -------------------------------------------------------------------- */
85
    /*      Handle a few common short cuts.                                 */
86
    /* -------------------------------------------------------------------- */
87
8.81k
    if (EQUAL(pszSrcEncoding, pszDstEncoding))
88
0
        return CPLStrdup(pszSource);
89
90
8.81k
    if (EQUAL(pszSrcEncoding, CPL_ENC_ASCII) &&
91
8.81k
        (EQUAL(pszDstEncoding, CPL_ENC_UTF8) ||
92
0
         EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
93
0
        return CPLStrdup(pszSource);
94
95
    // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables
96
8.81k
    if (EQUAL(pszDstEncoding, CPL_ENC_UTF8) &&
97
8.81k
        CPLGetConversionTableToUTF8(pszSrcEncoding))
98
1.27k
    {
99
1.27k
        return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
100
1.27k
    }
101
102
7.54k
#ifdef CPL_RECODE_ICONV
103
    /* -------------------------------------------------------------------- */
104
    /*      CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8                               */
105
    /*      and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are handled   */
106
    /*      very well by the stub implementation which is faster than the   */
107
    /*      iconv() route. Use a stub for these two ones and iconv()        */
108
    /*      everything else.                                                */
109
    /* -------------------------------------------------------------------- */
110
7.54k
    if ((EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) &&
111
7.54k
         EQUAL(pszDstEncoding, CPL_ENC_UTF8)) ||
112
7.54k
        (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) &&
113
0
         EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
114
7.54k
    {
115
7.54k
        return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
116
7.54k
    }
117
#ifdef _WIN32
118
    else if (((EQUAL(pszSrcEncoding, "CP_ACP") ||
119
               EQUAL(pszSrcEncoding, "CP_OEMCP")) &&
120
              EQUAL(pszDstEncoding, CPL_ENC_UTF8)) ||
121
             (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) &&
122
              (EQUAL(pszDstEncoding, "CP_ACP") ||
123
               EQUAL(pszDstEncoding, "CP_OEMCP"))))
124
    {
125
        return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
126
    }
127
#endif
128
0
    else
129
0
    {
130
0
        return CPLRecodeIconv(pszSource, pszSrcEncoding, pszDstEncoding);
131
0
    }
132
#else   // CPL_RECODE_STUB
133
    return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
134
#endif  // CPL_RECODE_ICONV
135
7.54k
}
136
137
/************************************************************************/
138
/*                         CPLRecodeFromWChar()                         */
139
/************************************************************************/
140
141
/**
142
 * Convert wchar_t string to UTF-8.
143
 *
144
 * Convert a wchar_t string into a multibyte utf-8 string.  The only
145
 * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
146
 * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
147
 * and CPL_ENC_ISO8859_1.  In some cases (i.e. using iconv()) other encodings
148
 * may also be supported.
149
 *
150
 * Note that the wchar_t type varies in size on different systems. On
151
 * win32 it is normally 2 bytes, and on UNIX 4 bytes.
152
 *
153
 * If an error occurs an error may, or may not be posted with CPLError().
154
 *
155
 * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
156
 * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
157
 * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
158
 *
159
 * @return a zero terminated multi-byte string which should be freed with
160
 * CPLFree(), or NULL if an error occurs.
161
 *
162
 * @since GDAL 1.6.0
163
 */
164
165
char CPL_DLL *CPLRecodeFromWChar(const wchar_t *pwszSource,
166
                                 const char *pszSrcEncoding,
167
                                 const char *pszDstEncoding)
168
169
14.0k
{
170
14.0k
#ifdef CPL_RECODE_ICONV
171
    /* -------------------------------------------------------------------- */
172
    /*      Conversions from CPL_ENC_UCS2                                   */
173
    /*      to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well   */
174
    /*      handled by the stub implementation.                             */
175
    /* -------------------------------------------------------------------- */
176
14.0k
    if ((EQUAL(pszSrcEncoding, CPL_ENC_UCS2) ||
177
14.0k
         EQUAL(pszSrcEncoding, "WCHAR_T")) &&
178
14.0k
        (EQUAL(pszDstEncoding, CPL_ENC_UTF8) ||
179
14.0k
         EQUAL(pszDstEncoding, CPL_ENC_ASCII) ||
180
14.0k
         EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
181
14.0k
    {
182
14.0k
        return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding,
183
14.0k
                                      pszDstEncoding);
184
14.0k
    }
185
186
0
    return CPLRecodeFromWCharIconv(pwszSource, pszSrcEncoding, pszDstEncoding);
187
188
#else   // CPL_RECODE_STUB
189
    return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding, pszDstEncoding);
190
#endif  // CPL_RECODE_ICONV
191
14.0k
}
192
193
/************************************************************************/
194
/*                          CPLRecodeToWChar()                          */
195
/************************************************************************/
196
197
/**
198
 * Convert UTF-8 string to a wchar_t string.
199
 *
200
 * Convert a 8bit, multi-byte per character input string into a wide
201
 * character (wchar_t) string.  The only guaranteed supported source encodings
202
 * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1).  The only
203
 * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
204
 * and destination encodings may be supported depending on the underlying
205
 * implementation.
206
 *
207
 * Note that the wchar_t type varies in size on different systems. On
208
 * win32 it is normally 2 bytes, and on UNIX 4 bytes.
209
 *
210
 * If an error occurs an error may, or may not be posted with CPLError().
211
 *
212
 * @param pszSource input multi-byte character string.
213
 * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
214
 * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
215
 *
216
 * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
217
 * NULL on error.
218
 *
219
 * @since GDAL 1.6.0
220
 */
221
222
wchar_t CPL_DLL *CPLRecodeToWChar(const char *pszSource,
223
                                  const char *pszSrcEncoding,
224
                                  const char *pszDstEncoding)
225
226
0
{
227
0
#ifdef CPL_RECODE_ICONV
228
    /* -------------------------------------------------------------------- */
229
    /*      Conversions to CPL_ENC_UCS2                                     */
230
    /*      from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
231
    /*      handled by the stub implementation.                             */
232
    /* -------------------------------------------------------------------- */
233
0
    if ((EQUAL(pszDstEncoding, CPL_ENC_UCS2) ||
234
0
         EQUAL(pszDstEncoding, "WCHAR_T")) &&
235
0
        (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) ||
236
0
         EQUAL(pszSrcEncoding, CPL_ENC_ASCII) ||
237
0
         EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1)))
238
0
    {
239
0
        return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding);
240
0
    }
241
242
0
    return CPLRecodeToWCharIconv(pszSource, pszSrcEncoding, pszDstEncoding);
243
244
#else   // CPL_RECODE_STUB
245
    return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding);
246
#endif  // CPL_RECODE_ICONV
247
0
}
248
249
/************************************************************************/
250
/*                               CPLIsASCII()                           */
251
/************************************************************************/
252
253
/**
 * Check whether a byte string only contains ASCII characters.
 *
 * @param pabyData input string to test
 * @param nLen length of the input string, or -1 if the function must compute
 *             the string length. In which case it must be null terminated.
 * @return true if every examined byte is in the range [0, 127], false as
 *         soon as a byte above 127 is met.
 *
 * @since GDAL 3.6.0
 */
bool CPLIsASCII(const char *pabyData, size_t nLen)
{
    // (size_t)-1 is the sentinel meaning "measure the NUL-terminated input".
    const size_t nBytes =
        (nLen == static_cast<size_t>(-1)) ? strlen(pabyData) : nLen;

    const unsigned char *pabyCur =
        reinterpret_cast<const unsigned char *>(pabyData);
    const unsigned char *const pabyEnd = pabyCur + nBytes;
    for (; pabyCur != pabyEnd; ++pabyCur)
    {
        // Any byte with its high bit set (value > 127) is not ASCII.
        if ((*pabyCur & 0x80) != 0)
            return false;
    }
    return true;
}
274
275
/************************************************************************/
276
/*                          CPLForceToASCII()                           */
277
/************************************************************************/
278
279
/**
280
 * Return a new string that is made only of ASCII characters. If non-ASCII
281
 * characters are found in the input string, they will be replaced by the
282
 * provided replacement character.
283
 *
284
 * This function does not make any assumption on the encoding of the input
285
 * string (except it must be nul-terminated if nLen equals -1, or have at
286
 * least nLen bytes otherwise). CPLUTF8ForceToASCII() can be used instead when
287
 * the input string is known to be UTF-8 encoded.
288
 *
289
 * @param pabyData input string to test
290
 * @param nLen length of the input string, or -1 if the function must compute
291
 *             the string length. In which case it must be null terminated.
292
293
 * @param chReplacementChar character which will be used when the input stream
294
 *                          contains a non ASCII character. Must be valid ASCII!
295
 *
296
 * @return a new string that must be freed with CPLFree().
297
 *
298
 * @since GDAL 1.7.0
299
 */
300
char *CPLForceToASCII(const char *pabyData, int nLen, char chReplacementChar)
301
0
{
302
0
    const size_t nRealLen =
303
0
        (nLen >= 0) ? static_cast<size_t>(nLen) : strlen(pabyData);
304
0
    char *pszOutputString = static_cast<char *>(CPLMalloc(nRealLen + 1));
305
0
    const char *pszPtr = pabyData;
306
0
    const char *pszEnd = pabyData + nRealLen;
307
0
    size_t i = 0;
308
0
    while (pszPtr != pszEnd)
309
0
    {
310
0
        if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127)
311
0
        {
312
0
            pszOutputString[i] = chReplacementChar;
313
0
            ++pszPtr;
314
0
            ++i;
315
0
        }
316
0
        else
317
0
        {
318
0
            pszOutputString[i] = *pszPtr;
319
0
            ++pszPtr;
320
0
            ++i;
321
0
        }
322
0
    }
323
0
    pszOutputString[i] = '\0';
324
0
    return pszOutputString;
325
0
}
326
327
/************************************************************************/
328
/*                       CPLUTF8ForceToASCII()                          */
329
/************************************************************************/
330
331
/**
332
 * Return a new string that is made only of ASCII characters. If non-ASCII
333
 * characters are found in the input string, for which an "equivalent" ASCII
334
 * character is not found, they will be replaced by the provided replacement
335
 * character.
336
 *
337
 * This function is aware of https://en.wikipedia.org/wiki/Latin-1_Supplement
338
 * and https://en.wikipedia.org/wiki/Latin_Extended-A to provide sensible
339
 * replacements for accented characters.
340
341
 * @param pszStr NUL-terminated UTF-8 string.
342
 * @param chReplacementChar character which will be used when the input stream
343
 *                          contains a non ASCII character that cannot be
344
 *                          substituted with an equivalent ASCII character.
345
 *                          Must be valid ASCII!
346
 *
347
 * @return a new string that must be freed with CPLFree().
348
 *
349
 * @since GDAL 3.9
350
 */
351
char *CPLUTF8ForceToASCII(const char *pszStr, char chReplacementChar)
352
0
{
353
0
    static const struct
354
0
    {
355
0
        short nCodePoint;
356
0
        char chFirst;
357
0
        char chSecond;
358
0
    } aLatinCharacters[] = {
359
        // https://en.wikipedia.org/wiki/Latin-1_Supplement
360
0
        {0xC0, 'A', 0},    // Latin Capital Letter A with grave
361
0
        {0xC1, 'A', 0},    // Latin Capital letter A with acute
362
0
        {0xC2, 'A', 0},    // Latin Capital letter A with circumflex
363
0
        {0xC3, 'A', 0},    // Latin Capital letter A with tilde
364
0
        {0xC4, 'A', 0},    // Latin Capital letter A with diaeresis
365
0
        {0xC5, 'A', 0},    // Latin Capital letter A with ring above
366
0
        {0xC6, 'A', 'E'},  // Latin Capital letter AE
367
0
        {0xC7, 'C', 0},    // Latin Capital letter C with cedilla
368
0
        {0xC8, 'E', 0},    // Latin Capital letter E with grave
369
0
        {0xC9, 'E', 0},    // Latin Capital letter E with acute
370
0
        {0xCA, 'E', 0},    // Latin Capital letter E with circumflex
371
0
        {0xCB, 'E', 0},    // Latin Capital letter E with diaeresis
372
0
        {0xCC, 'I', 0},    // Latin Capital letter I with grave
373
0
        {0xCD, 'I', 0},    // Latin Capital letter I with acute
374
0
        {0xCE, 'I', 0},    // Latin Capital letter I with circumflex
375
0
        {0xCF, 'I', 0},    // Latin Capital letter I with diaeresis
376
        // { 0xD0, '?', 0 }, // Latin Capital letter Eth
377
0
        {0xD1, 'N', 0},  // Latin Capital letter N with tilde
378
0
        {0xD2, 'O', 0},  // Latin Capital letter O with grave
379
0
        {0xD3, 'O', 0},  // Latin Capital letter O with acute
380
0
        {0xD4, 'O', 0},  // Latin Capital letter O with circumflex
381
0
        {0xD5, 'O', 0},  // Latin Capital letter O with tilde
382
0
        {0xD6, 'O', 0},  // Latin Capital letter O with diaeresis
383
0
        {0xD8, 'O', 0},  // Latin Capital letter O with stroke
384
0
        {0xD9, 'U', 0},  // Latin Capital letter U with grave
385
0
        {0xDA, 'U', 0},  // Latin Capital letter U with acute
386
0
        {0xDB, 'U', 0},  // Latin Capital Letter U with circumflex
387
0
        {0xDC, 'U', 0},  // Latin Capital Letter U with diaeresis
388
0
        {0xDD, 'Y', 0},  // Latin Capital Letter Y with acute
389
        // { 0xDE, '?', 0 }, // Latin Capital Letter Thorn
390
0
        {0xDF, 'S', 'S'},  // Latin Small Letter sharp S
391
0
        {0xE0, 'a', 0},    // Latin Small Letter A with grave
392
0
        {0xE1, 'a', 0},    // Latin Small Letter A with acute
393
0
        {0xE2, 'a', 0},    // Latin Small Letter A with circumflex
394
0
        {0xE3, 'a', 0},    // Latin Small Letter A with tilde
395
0
        {0xE4, 'a', 0},    // Latin Small Letter A with diaeresis
396
0
        {0xE5, 'a', 0},    // Latin Small Letter A with ring above
397
0
        {0xE6, 'a', 'e'},  // Latin Small Letter AE
398
0
        {0xE7, 'c', 0},    // Latin Small Letter C with cedilla
399
0
        {0xE8, 'e', 0},    // Latin Small Letter E with grave
400
0
        {0xE9, 'e', 0},    // Latin Small Letter E with acute
401
0
        {0xEA, 'e', 0},    // Latin Small Letter E with circumflex
402
0
        {0xEB, 'e', 0},    // Latin Small Letter E with diaeresis
403
0
        {0xEC, 'i', 0},    // Latin Small Letter I with grave
404
0
        {0xED, 'i', 0},    // Latin Small Letter I with acute
405
0
        {0xEE, 'i', 0},    // Latin Small Letter I with circumflex
406
0
        {0xEF, 'i', 0},    // Latin Small Letter I with diaeresis
407
        // { 0xF0, '?', 0 }, // Latin Small Letter Eth
408
0
        {0xF1, 'n', 0},  // Latin Small Letter N with tilde
409
0
        {0xF2, 'o', 0},  // Latin Small Letter O with grave
410
0
        {0xF3, 'o', 0},  // Latin Small Letter O with acute
411
0
        {0xF4, 'o', 0},  // Latin Small Letter O with circumflex
412
0
        {0xF5, 'o', 0},  // Latin Small Letter O with tilde
413
0
        {0xF6, 'o', 0},  // Latin Small Letter O with diaeresis
414
0
        {0xF8, 'o', 0},  // Latin Small Letter O with stroke
415
0
        {0xF9, 'u', 0},  // Latin Small Letter U with grave
416
0
        {0xFA, 'u', 0},  // Latin Small Letter U with acute
417
0
        {0xFB, 'u', 0},  // Latin Small Letter U with circumflex
418
0
        {0xFC, 'u', 0},  // Latin Small Letter U with diaeresis
419
0
        {0xFD, 'y', 0},  // Latin Small Letter Y with acute
420
        // { 0xFE, '?', 0 }, // Latin Small Letter Thorn
421
0
        {0xFF, 'u', 0},  // Latin Small Letter Y with diaeresis
422
423
        // https://en.wikipedia.org/wiki/Latin_Extended-A
424
0
        {
425
0
            0x0100,
426
0
            'A',
427
0
            0,
428
0
        },  // Latin Capital letter A with macron
429
0
        {
430
0
            0x0101,
431
0
            'a',
432
0
            0,
433
0
        },  // Latin Small letter A with macron
434
0
        {
435
0
            0x0102,
436
0
            'A',
437
0
            0,
438
0
        },  // Latin Capital letter A with breve
439
0
        {
440
0
            0x0103,
441
0
            'a',
442
0
            0,
443
0
        },  // Latin Small letter A with breve
444
0
        {
445
0
            0x0104,
446
0
            'A',
447
0
            0,
448
0
        },  // Latin Capital letter A with ogonek
449
0
        {
450
0
            0x0105,
451
0
            'a',
452
0
            0,
453
0
        },  // Latin Small letter A with ogonek
454
0
        {
455
0
            0x0106,
456
0
            'C',
457
0
            0,
458
0
        },  // Latin Capital letter C with acute
459
0
        {
460
0
            0x0107,
461
0
            'c',
462
0
            0,
463
0
        },  // Latin Small letter C with acute
464
0
        {
465
0
            0x0108,
466
0
            'C',
467
0
            0,
468
0
        },  // Latin Capital letter C with circumflex
469
0
        {
470
0
            0x0109,
471
0
            'c',
472
0
            0,
473
0
        },  // Latin Small letter C with circumflex
474
0
        {
475
0
            0x010A,
476
0
            'C',
477
0
            0,
478
0
        },  // Latin Capital letter C with dot above
479
0
        {
480
0
            0x010B,
481
0
            'c',
482
0
            0,
483
0
        },  // Latin Small letter C with dot above
484
0
        {
485
0
            0x010C,
486
0
            'C',
487
0
            0,
488
0
        },  // Latin Capital letter C with caron
489
0
        {
490
0
            0x010D,
491
0
            'c',
492
0
            0,
493
0
        },  // Latin Small letter C with caron
494
0
        {
495
0
            0x010E,
496
0
            'D',
497
0
            0,
498
0
        },  // Latin Capital letter D with caron
499
0
        {
500
0
            0x010F,
501
0
            'd',
502
0
            0,
503
0
        },  // Latin Small letter D with caron
504
0
        {
505
0
            0x0110,
506
0
            'D',
507
0
            0,
508
0
        },  // Latin Capital letter D with stroke
509
0
        {
510
0
            0x0111,
511
0
            'd',
512
0
            0,
513
0
        },  // Latin Small letter D with stroke
514
0
        {
515
0
            0x0112,
516
0
            'E',
517
0
            0,
518
0
        },  // Latin Capital letter E with macron
519
0
        {
520
0
            0x0113,
521
0
            'e',
522
0
            0,
523
0
        },  // Latin Small letter E with macron
524
0
        {
525
0
            0x0114,
526
0
            'E',
527
0
            0,
528
0
        },  // Latin Capital letter E with breve
529
0
        {
530
0
            0x0115,
531
0
            'e',
532
0
            0,
533
0
        },  // Latin Small letter E with breve
534
0
        {
535
0
            0x0116,
536
0
            'E',
537
0
            0,
538
0
        },  // Latin Capital letter E with dot above
539
0
        {
540
0
            0x0117,
541
0
            'e',
542
0
            0,
543
0
        },  // Latin Small letter E with dot above
544
0
        {
545
0
            0x0118,
546
0
            'E',
547
0
            0,
548
0
        },  // Latin Capital letter E with ogonek
549
0
        {
550
0
            0x0119,
551
0
            'e',
552
0
            0,
553
0
        },  // Latin Small letter E with ogonek
554
0
        {
555
0
            0x011A,
556
0
            'E',
557
0
            0,
558
0
        },  // Latin Capital letter E with caron
559
0
        {
560
0
            0x011B,
561
0
            'e',
562
0
            0,
563
0
        },  // Latin Small letter E with caron
564
0
        {
565
0
            0x011C,
566
0
            'G',
567
0
            0,
568
0
        },  // Latin Capital letter G with circumflex
569
0
        {
570
0
            0x011D,
571
0
            'g',
572
0
            0,
573
0
        },  // Latin Small letter G with circumflex
574
0
        {
575
0
            0x011E,
576
0
            'G',
577
0
            0,
578
0
        },  // Latin Capital letter G with breve
579
0
        {
580
0
            0x011F,
581
0
            'g',
582
0
            0,
583
0
        },  // Latin Small letter G with breve
584
0
        {
585
0
            0x0120,
586
0
            'G',
587
0
            0,
588
0
        },  // Latin Capital letter G with dot above
589
0
        {
590
0
            0x0121,
591
0
            'g',
592
0
            0,
593
0
        },  // Latin Small letter G with dot above
594
0
        {
595
0
            0x0122,
596
0
            'G',
597
0
            0,
598
0
        },  // Latin Capital letter G with cedilla
599
0
        {
600
0
            0x0123,
601
0
            'g',
602
0
            0,
603
0
        },  // Latin Small letter G with cedilla
604
0
        {
605
0
            0x0124,
606
0
            'H',
607
0
            0,
608
0
        },  // Latin Capital letter H with circumflex
609
0
        {
610
0
            0x0125,
611
0
            'h',
612
0
            0,
613
0
        },  // Latin Small letter H with circumflex
614
0
        {
615
0
            0x0126,
616
0
            'H',
617
0
            0,
618
0
        },  // Latin Capital letter H with stroke
619
0
        {
620
0
            0x0127,
621
0
            'h',
622
0
            0,
623
0
        },  // Latin Small letter H with stroke
624
0
        {
625
0
            0x0128,
626
0
            'I',
627
0
            0,
628
0
        },  // Latin Capital letter I with tilde
629
0
        {
630
0
            0x0129,
631
0
            'i',
632
0
            0,
633
0
        },  // Latin Small letter I with tilde
634
0
        {
635
0
            0x012A,
636
0
            'I',
637
0
            0,
638
0
        },  // Latin Capital letter I with macron
639
0
        {
640
0
            0x012B,
641
0
            'i',
642
0
            0,
643
0
        },  // Latin Small letter I with macron
644
0
        {
645
0
            0x012C,
646
0
            'I',
647
0
            0,
648
0
        },  // Latin Capital letter I with breve
649
0
        {
650
0
            0x012D,
651
0
            'i',
652
0
            0,
653
0
        },  // Latin Small letter I with breve
654
0
        {
655
0
            0x012E,
656
0
            'I',
657
0
            0,
658
0
        },  // Latin Capital letter I with ogonek
659
0
        {
660
0
            0x012F,
661
0
            'i',
662
0
            0,
663
0
        },  // Latin Small letter I with ogonek
664
0
        {
665
0
            0x0130,
666
0
            'I',
667
0
            0,
668
0
        },  // Latin Capital letter I with dot above
669
0
        {
670
0
            0x0131,
671
0
            'i',
672
0
            0,
673
0
        },  // Latin Small letter dotless I
674
0
        {
675
0
            0x0132,
676
0
            'I',
677
0
            'J',
678
0
        },  // Latin Capital Ligature IJ
679
0
        {
680
0
            0x0133,
681
0
            'i',
682
0
            'j',
683
0
        },  // Latin Small Ligature IJ
684
0
        {
685
0
            0x0134,
686
0
            'J',
687
0
            0,
688
0
        },  // Latin Capital letter J with circumflex
689
0
        {
690
0
            0x0135,
691
0
            'j',
692
0
            0,
693
0
        },  // Latin Small letter J with circumflex
694
0
        {
695
0
            0x0136,
696
0
            'K',
697
0
            0,
698
0
        },  // Latin Capital letter K with cedilla
699
0
        {
700
0
            0x0137,
701
0
            'k',
702
0
            0,
703
0
        },  // Latin Small letter K with cedilla
704
0
        {
705
0
            0x0138,
706
0
            'k',
707
0
            0,
708
0
        },  // Latin Small letter Kra
709
0
        {
710
0
            0x0139,
711
0
            'L',
712
0
            0,
713
0
        },  // Latin Capital letter L with acute
714
0
        {
715
0
            0x013A,
716
0
            'l',
717
0
            0,
718
0
        },  // Latin Small letter L with acute
719
0
        {
720
0
            0x013B,
721
0
            'L',
722
0
            0,
723
0
        },  // Latin Capital letter L with cedilla
724
0
        {
725
0
            0x013C,
726
0
            'l',
727
0
            0,
728
0
        },  // Latin Small letter L with cedilla
729
0
        {
730
0
            0x013D,
731
0
            'L',
732
0
            0,
733
0
        },  // Latin Capital letter L with caron
734
0
        {
735
0
            0x013E,
736
0
            'l',
737
0
            0,
738
0
        },  // Latin Small letter L with caron
739
0
        {
740
0
            0x013F,
741
0
            'L',
742
0
            0,
743
0
        },  // Latin Capital letter L with middle dot
744
0
        {
745
0
            0x0140,
746
0
            'l',
747
0
            0,
748
0
        },  // Latin Small letter L with middle dot
749
0
        {
750
0
            0x0141,
751
0
            'L',
752
0
            0,
753
0
        },  // Latin Capital letter L with stroke
754
0
        {
755
0
            0x0142,
756
0
            'l',
757
0
            0,
758
0
        },  // Latin Small letter L with stroke
759
0
        {
760
0
            0x0143,
761
0
            'N',
762
0
            0,
763
0
        },  // Latin Capital letter N with acute
764
0
        {
765
0
            0x0144,
766
0
            'n',
767
0
            0,
768
0
        },  // Latin Small letter N with acute
769
0
        {
770
0
            0x0145,
771
0
            'N',
772
0
            0,
773
0
        },  // Latin Capital letter N with cedilla
774
0
        {
775
0
            0x0146,
776
0
            'n',
777
0
            0,
778
0
        },  // Latin Small letter N with cedilla
779
0
        {
780
0
            0x0147,
781
0
            'N',
782
0
            0,
783
0
        },  // Latin Capital letter N with caron
784
0
        {
785
0
            0x0148,
786
0
            'n',
787
0
            0,
788
0
        },  // Latin Small letter N with caron
789
        // { 0x014A , '?' , 0, }, // Latin Capital letter Eng
790
        // { 0x014B , '?' , 0, }, // Latin Small letter Eng
791
0
        {
792
0
            0x014C,
793
0
            'O',
794
0
            0,
795
0
        },  // Latin Capital letter O with macron
796
0
        {
797
0
            0x014D,
798
0
            'o',
799
0
            0,
800
0
        },  // Latin Small letter O with macron
801
0
        {
802
0
            0x014E,
803
0
            'O',
804
0
            0,
805
0
        },  // Latin Capital letter O with breve
806
0
        {
807
0
            0x014F,
808
0
            'o',
809
0
            0,
810
0
        },  // Latin Small letter O with breve
811
0
        {
812
0
            0x0150,
813
0
            'O',
814
0
            0,
815
0
        },  // Latin Capital Letter O with double acute
816
0
        {
817
0
            0x0151,
818
0
            'o',
819
0
            0,
820
0
        },  // Latin Small Letter O with double acute
821
0
        {
822
0
            0x0152,
823
0
            'O',
824
0
            'E',
825
0
        },  // Latin Capital Ligature OE
826
0
        {
827
0
            0x0153,
828
0
            'o',
829
0
            'e',
830
0
        },  // Latin Small Ligature OE
831
0
        {
832
0
            0x0154,
833
0
            'R',
834
0
            0,
835
0
        },  // Latin Capital letter R with acute
836
0
        {
837
0
            0x0155,
838
0
            'r',
839
0
            0,
840
0
        },  // Latin Small letter R with acute
841
0
        {
842
0
            0x0156,
843
0
            'R',
844
0
            0,
845
0
        },  // Latin Capital letter R with cedilla
846
0
        {
847
0
            0x0157,
848
0
            'r',
849
0
            0,
850
0
        },  // Latin Small letter R with cedilla
851
0
        {
852
0
            0x0158,
853
0
            'R',
854
0
            0,
855
0
        },  // Latin Capital letter R with caron
856
0
        {
857
0
            0x0159,
858
0
            'r',
859
0
            0,
860
0
        },  // Latin Small letter R with caron
861
0
        {
862
0
            0x015A,
863
0
            'S',
864
0
            0,
865
0
        },  // Latin Capital letter S with acute
866
0
        {
867
0
            0x015B,
868
0
            's',
869
0
            0,
870
0
        },  // Latin Small letter S with acute
871
0
        {
872
0
            0x015C,
873
0
            'S',
874
0
            0,
875
0
        },  // Latin Capital letter S with circumflex
876
0
        {
877
0
            0x015D,
878
0
            's',
879
0
            0,
880
0
        },  // Latin Small letter S with circumflex
881
0
        {
882
0
            0x015E,
883
0
            'S',
884
0
            0,
885
0
        },  // Latin Capital letter S with cedilla
886
0
        {
887
0
            0x015F,
888
0
            's',
889
0
            0,
890
0
        },  // Latin Small letter S with cedilla
891
0
        {
892
0
            0x0160,
893
0
            'S',
894
0
            0,
895
0
        },  // Latin Capital letter S with caron
896
0
        {
897
0
            0x0161,
898
0
            's',
899
0
            0,
900
0
        },  // Latin Small letter S with caron
901
0
        {
902
0
            0x0162,
903
0
            'T',
904
0
            0,
905
0
        },  // Latin Capital letter T with cedilla
906
0
        {
907
0
            0x0163,
908
0
            't',
909
0
            0,
910
0
        },  // Latin Small letter T with cedilla
911
0
        {
912
0
            0x0164,
913
0
            'T',
914
0
            0,
915
0
        },  // Latin Capital letter T with caron
916
0
        {
917
0
            0x0165,
918
0
            't',
919
0
            0,
920
0
        },  // Latin Small letter T with caron
921
0
        {
922
0
            0x0166,
923
0
            'T',
924
0
            0,
925
0
        },  // Latin Capital letter T with stroke
926
0
        {
927
0
            0x0167,
928
0
            't',
929
0
            0,
930
0
        },  // Latin Small letter T with stroke
931
0
        {
932
0
            0x0168,
933
0
            'U',
934
0
            0,
935
0
        },  // Latin Capital letter U with tilde
936
0
        {
937
0
            0x0169,
938
0
            'u',
939
0
            0,
940
0
        },  // Latin Small letter U with tilde
941
0
        {
942
0
            0x016A,
943
0
            'U',
944
0
            0,
945
0
        },  // Latin Capital letter U with macron
946
0
        {
947
0
            0x016B,
948
0
            'u',
949
0
            0,
950
0
        },  // Latin Small letter U with macron
951
0
        {
952
0
            0x016C,
953
0
            'U',
954
0
            0,
955
0
        },  // Latin Capital letter U with breve
956
0
        {
957
0
            0x016D,
958
0
            'u',
959
0
            0,
960
0
        },  // Latin Small letter U with breve
961
0
        {
962
0
            0x016E,
963
0
            'U',
964
0
            0,
965
0
        },  // Latin Capital letter U with ring above
966
0
        {
967
0
            0x016F,
968
0
            'u',
969
0
            0,
970
0
        },  // Latin Small letter U with ring above
971
0
        {
972
0
            0x0170,
973
0
            'U',
974
0
            0,
975
0
        },  // Latin Capital Letter U with double acute
976
0
        {
977
0
            0x0171,
978
0
            'u',
979
0
            0,
980
0
        },  // Latin Small Letter U with double acute
981
0
        {
982
0
            0x0172,
983
0
            'U',
984
0
            0,
985
0
        },  // Latin Capital letter U with ogonek
986
0
        {
987
0
            0x0173,
988
0
            'u',
989
0
            0,
990
0
        },  // Latin Small letter U with ogonek
991
0
        {
992
0
            0x0174,
993
0
            'W',
994
0
            0,
995
0
        },  // Latin Capital letter W with circumflex
996
0
        {
997
0
            0x0175,
998
0
            'w',
999
0
            0,
1000
0
        },  // Latin Small letter W with circumflex
1001
0
        {
1002
0
            0x0176,
1003
0
            'Y',
1004
0
            0,
1005
0
        },  // Latin Capital letter Y with circumflex
1006
0
        {
1007
0
            0x0177,
1008
0
            'y',
1009
0
            0,
1010
0
        },  // Latin Small letter Y with circumflex
1011
0
        {
1012
0
            0x0178,
1013
0
            'Y',
1014
0
            0,
1015
0
        },  // Latin Capital letter Y with diaeresis
1016
0
        {
1017
0
            0x0179,
1018
0
            'Z',
1019
0
            0,
1020
0
        },  // Latin Capital letter Z with acute
1021
0
        {
1022
0
            0x017A,
1023
0
            'z',
1024
0
            0,
1025
0
        },  // Latin Small letter Z with acute
1026
0
        {
1027
0
            0x017B,
1028
0
            'Z',
1029
0
            0,
1030
0
        },  // Latin Capital letter Z with dot above
1031
0
        {
1032
0
            0x017C,
1033
0
            'z',
1034
0
            0,
1035
0
        },  // Latin Small letter Z with dot above
1036
0
        {
1037
0
            0x017D,
1038
0
            'Z',
1039
0
            0,
1040
0
        },  // Latin Capital letter Z with caron
1041
0
        {
1042
0
            0x017E,
1043
0
            'z',
1044
0
            0,
1045
0
        },  // Latin Small letter Z with caron
1046
0
    };
1047
1048
0
    const size_t nLen = strlen(pszStr);
1049
0
    char *pszOutputString = static_cast<char *>(CPLMalloc(nLen + 1));
1050
0
    const char *pszPtr = pszStr;
1051
0
    const char *pszEnd = pszStr + nLen;
1052
0
    size_t i = 0;
1053
0
    while (pszPtr != pszEnd)
1054
0
    {
1055
0
        if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127)
1056
0
        {
1057
0
            utf8_int32_t codepoint;
1058
0
            if (pszPtr + utf8codepointcalcsize(
1059
0
                             reinterpret_cast<const utf8_int8_t *>(pszPtr)) >
1060
0
                pszEnd)
1061
0
                break;
1062
0
            auto pszNext = reinterpret_cast<const char *>(utf8codepoint(
1063
0
                reinterpret_cast<const utf8_int8_t *>(pszPtr), &codepoint));
1064
0
            char ch = chReplacementChar;
1065
0
            for (const auto &latin1char : aLatinCharacters)
1066
0
            {
1067
0
                if (codepoint == latin1char.nCodePoint)
1068
0
                {
1069
0
                    pszOutputString[i] = latin1char.chFirst;
1070
0
                    ++i;
1071
0
                    if (latin1char.chSecond)
1072
0
                    {
1073
0
                        pszOutputString[i] = latin1char.chSecond;
1074
0
                        ++i;
1075
0
                    }
1076
0
                    ch = 0;
1077
0
                    break;
1078
0
                }
1079
0
            }
1080
0
            if (ch)
1081
0
            {
1082
0
                pszOutputString[i] = ch;
1083
0
                ++i;
1084
0
            }
1085
0
            pszPtr = pszNext;
1086
0
        }
1087
0
        else
1088
0
        {
1089
0
            pszOutputString[i] = *pszPtr;
1090
0
            ++pszPtr;
1091
0
            ++i;
1092
0
        }
1093
0
    }
1094
0
    pszOutputString[i] = '\0';
1095
0
    return pszOutputString;
1096
0
}
1097
1098
/************************************************************************/
1099
/*                        CPLEncodingCharSize()                         */
1100
/************************************************************************/
1101
1102
/**
1103
 * Return bytes per character for encoding.
1104
 *
1105
 * This function returns the size in bytes of the smallest character
1106
 * in this encoding.  For fixed width encodings (ASCII, UCS-2, UCS-4) this
1107
 * is straightforward.  For encodings like UTF8 and UTF16 which represent
1108
 * some characters as a sequence of atomic character sizes the function
1109
 * still returns the atomic character size (1 for UTF8, 2 for UTF16).
1110
 *
1111
 * This function will return the correct value for well known encodings
1112
 * with corresponding CPL_ENC_ values.  It may not return the correct value
1113
 * for other encodings even if they are supported by the underlying iconv
1114
 * or windows transliteration services.  Hopefully it will improve over time.
1115
 *
1116
 * @param pszEncoding the name of the encoding.
1117
 *
1118
 * @return the size of a minimal character in bytes or -1 if the size is
1119
 * unknown.
1120
 */
1121
1122
int CPLEncodingCharSize(const char *pszEncoding)
1123
1124
0
{
1125
0
    if (EQUAL(pszEncoding, CPL_ENC_UTF8))
1126
0
        return 1;
1127
0
    else if (EQUAL(pszEncoding, CPL_ENC_UTF16) ||
1128
0
             EQUAL(pszEncoding, "UTF-16LE"))
1129
0
        return 2;
1130
0
    else if (EQUAL(pszEncoding, CPL_ENC_UCS2) || EQUAL(pszEncoding, "UCS-2LE"))
1131
0
        return 2;
1132
0
    else if (EQUAL(pszEncoding, CPL_ENC_UCS4))
1133
0
        return 4;
1134
0
    else if (EQUAL(pszEncoding, CPL_ENC_ASCII))
1135
0
        return 1;
1136
0
    else if (STARTS_WITH_CI(pszEncoding, "ISO-8859-"))
1137
0
        return 1;
1138
1139
0
    return -1;
1140
0
}
1141
1142
/************************************************************************/
1143
/*                    CPLClearRecodeWarningFlags()                      */
1144
/************************************************************************/
1145
1146
void CPLClearRecodeWarningFlags()
1147
0
{
1148
0
#ifdef CPL_RECODE_ICONV
1149
0
    CPLClearRecodeIconvWarningFlags();
1150
0
#endif
1151
0
    CPLClearRecodeStubWarningFlags();
1152
0
}
1153
1154
/************************************************************************/
1155
/*                         CPLStrlenUTF8()                              */
1156
/************************************************************************/
1157
1158
/**
1159
 * Return the number of UTF-8 characters of a nul-terminated string.
1160
 *
1161
 * This is different from strlen() which returns the number of bytes.
1162
 *
1163
 * @param pszUTF8Str a nul-terminated UTF-8 string
1164
 *
1165
 * @return the number of UTF-8 characters.
1166
 */
1167
1168
int CPLStrlenUTF8(const char *pszUTF8Str)
{
    int nCharacterCount = 0;

    // Walk the bytes through a pointer rather than an int index: an int index
    // would overflow (undefined behaviour) on strings of INT_MAX bytes or
    // more, even when the number of characters still fits in the int result.
    for (const char *pszIter = pszUTF8Str; *pszIter != '\0'; ++pszIter)
    {
        // Bytes of the form 10xxxxxx are continuation bytes of a multi-byte
        // sequence; every other byte marks the start of a character.
        if ((*pszIter & 0xc0) != 0x80)
        {
            ++nCharacterCount;
        }
    }

    return nCharacterCount;
}
1178
1179
/************************************************************************/
1180
/*                           CPLCanRecode()                             */
1181
/************************************************************************/
1182
1183
/**
1184
 * Checks if it is possible to recode a string from one encoding to another.
1185
 *
1186
 * @param pszTestStr a NULL terminated string.
1187
 * @param pszSrcEncoding the source encoding.
1188
 * @param pszDstEncoding the destination encoding.
1189
 *
1190
 * @return TRUE if recoding is possible.
1191
 *
1192
 * @since GDAL 3.1.0
1193
 */
1194
int CPLCanRecode(const char *pszTestStr, const char *pszSrcEncoding,
1195
                 const char *pszDstEncoding)
1196
0
{
1197
0
    CPLClearRecodeWarningFlags();
1198
0
    CPLErrorReset();
1199
1200
0
    CPLPushErrorHandler(CPLQuietErrorHandler);
1201
0
    char *pszRec(CPLRecode(pszTestStr, pszSrcEncoding, pszDstEncoding));
1202
0
    CPLPopErrorHandler();
1203
1204
0
    if (pszRec == nullptr)
1205
0
    {
1206
0
        return FALSE;
1207
0
    }
1208
1209
0
    CPLFree(pszRec);
1210
1211
0
    if (CPLGetLastErrorType() != 0)
1212
0
    {
1213
0
        return FALSE;
1214
0
    }
1215
1216
0
    return TRUE;
1217
0
}