Coverage Report

Created: 2025-11-16 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gdal/port/cpl_recode.cpp
Line
Count
Source
1
/**********************************************************************
2
 *
3
 * Name:     cpl_recode.cpp
4
 * Project:  CPL - Common Portability Library
5
 * Purpose:  Character set recoding and char/wchar_t conversions.
6
 * Author:   Andrey Kiselev, dron@ak4719.spb.edu
7
 *
8
 **********************************************************************
9
 * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
10
 * Copyright (c) 2008, Frank Warmerdam
11
 * Copyright (c) 2011-2014, Even Rouault <even dot rouault at spatialys.com>
12
 *
13
 * Permission to use, copy, modify, and distribute this software for any
14
 * purpose with or without fee is hereby granted, provided that the above
15
 * copyright notice and this permission notice appear in all copies.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24
 **********************************************************************/
25
26
#include "cpl_port.h"
27
#include "cpl_string.h"
28
29
#include <cstring>
30
31
#include "cpl_conv.h"
32
#include "cpl_character_sets.h"
33
34
#include "utf8.h"
35
36
#ifdef CPL_RECODE_ICONV
37
extern void CPLClearRecodeIconvWarningFlags();
38
extern char *CPLRecodeIconv(const char *, const char *,
39
                            const char *) CPL_RETURNS_NONNULL;
40
extern char *CPLRecodeFromWCharIconv(const wchar_t *, const char *,
41
                                     const char *);
42
extern wchar_t *CPLRecodeToWCharIconv(const char *, const char *, const char *);
43
#endif  // CPL_RECODE_ICONV
44
45
extern void CPLClearRecodeStubWarningFlags();
46
extern char *CPLRecodeStub(const char *, const char *,
47
                           const char *) CPL_RETURNS_NONNULL;
48
extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
49
                                    const char *);
50
extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
51
extern int CPLIsUTF8Stub(const char *, int);
52
53
/************************************************************************/
54
/*                             CPLRecode()                              */
55
/************************************************************************/
56
57
/**
58
 * Convert a string from a source encoding to a destination encoding.
59
 *
60
 * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
61
 * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
62
 * <ul>
63
 *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
64
 *  fact)</li>
65
 *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
66
 *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
67
 * </ul>
68
 *
69
 * If an error occurs an error may, or may not be posted with CPLError().
70
 *
71
 * @param pszSource a NULL terminated string.
72
 * @param pszSrcEncoding the source encoding.
73
 * @param pszDstEncoding the destination encoding.
74
 *
75
 * @return a NULL terminated string which should be freed with CPLFree().
76
 *
77
 */
78
79
char CPL_DLL *CPLRecode(const char *pszSource, const char *pszSrcEncoding,
                        const char *pszDstEncoding)
{
    // Same encoding on both sides: nothing to convert, return a copy.
    if (EQUAL(pszSrcEncoding, pszDstEncoding))
    {
        return CPLStrdup(pszSource);
    }

    const bool bDstIsUTF8 = EQUAL(pszDstEncoding, CPL_ENC_UTF8);
    const bool bDstIsLatin1 = EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1);

    // ASCII is a subset of both UTF-8 and ISO-8859-1, so a plain copy is
    // enough for those two destinations.
    const bool bSrcIsASCII = EQUAL(pszSrcEncoding, CPL_ENC_ASCII);
    if (bSrcIsASCII && (bDstIsUTF8 || bDstIsLatin1))
    {
        return CPLStrdup(pszSource);
    }

    // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables are served by the
    // stub. The table lookup is only attempted when the target is UTF-8.
    if (bDstIsUTF8 && CPLGetConversionTableToUTF8(pszSrcEncoding))
    {
        return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
    }

#ifdef CPL_RECODE_ICONV
    // ISO-8859-1 <-> UTF-8 is handled very well by the stub implementation,
    // which is faster than the iconv() route. Everything else goes through
    // iconv().
    const bool bSrcIsUTF8 = EQUAL(pszSrcEncoding, CPL_ENC_UTF8);
    const bool bSrcIsLatin1 = EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1);
    bool bUseStub =
        (bSrcIsLatin1 && bDstIsUTF8) || (bSrcIsUTF8 && bDstIsLatin1);

#ifdef _WIN32
    // On Windows the CP_ACP / CP_OEMCP <-> UTF-8 pairs are also routed to
    // the stub.
    if (!bUseStub)
    {
        const bool bSrcIsWinCP = EQUAL(pszSrcEncoding, "CP_ACP") ||
                                 EQUAL(pszSrcEncoding, "CP_OEMCP");
        const bool bDstIsWinCP = EQUAL(pszDstEncoding, "CP_ACP") ||
                                 EQUAL(pszDstEncoding, "CP_OEMCP");
        bUseStub =
            (bSrcIsWinCP && bDstIsUTF8) || (bSrcIsUTF8 && bDstIsWinCP);
    }
#endif

    if (!bUseStub)
    {
        return CPLRecodeIconv(pszSource, pszSrcEncoding, pszDstEncoding);
    }
#endif  // CPL_RECODE_ICONV

    // Either iconv support is not compiled in, or the pair is one the stub
    // is preferred for.
    return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
}
135
136
/************************************************************************/
137
/*                         CPLRecodeFromWChar()                         */
138
/************************************************************************/
139
140
/**
141
 * Convert wchar_t string to UTF-8.
142
 *
143
 * Convert a wchar_t string into a multibyte utf-8 string.  The only
144
 * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
145
 * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
146
 * and CPL_ENC_ISO8859_1.  In some cases (i.e. using iconv()) other encodings
147
 * may also be supported.
148
 *
149
 * Note that the wchar_t type varies in size on different systems. On
150
 * win32 it is normally 2 bytes, and on UNIX 4 bytes.
151
 *
152
 * If an error occurs an error may, or may not be posted with CPLError().
153
 *
154
 * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
155
 * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
156
 * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
157
 *
158
 * @return a zero terminated multi-byte string which should be freed with
159
 * CPLFree(), or NULL if an error occurs.
160
 *
161
 */
162
163
char CPL_DLL *CPLRecodeFromWChar(const wchar_t *pwszSource,
                                 const char *pszSrcEncoding,
                                 const char *pszDstEncoding)
{
#ifdef CPL_RECODE_ICONV
    // Conversions from CPL_ENC_UCS2 (or "WCHAR_T") to CPL_ENC_UTF8,
    // CPL_ENC_ASCII and CPL_ENC_ISO8859_1 are well handled by the stub
    // implementation. Any other pair is delegated to iconv().
    const bool bWideSource = EQUAL(pszSrcEncoding, CPL_ENC_UCS2) ||
                             EQUAL(pszSrcEncoding, "WCHAR_T");
    const bool bStubHandlesIt =
        bWideSource && (EQUAL(pszDstEncoding, CPL_ENC_UTF8) ||
                        EQUAL(pszDstEncoding, CPL_ENC_ASCII) ||
                        EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1));
    if (!bStubHandlesIt)
    {
        return CPLRecodeFromWCharIconv(pwszSource, pszSrcEncoding,
                                       pszDstEncoding);
    }
#endif  // CPL_RECODE_ICONV

    // Reached when iconv support is not compiled in, or for the pairs the
    // stub is preferred for.
    return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding, pszDstEncoding);
}
190
191
/************************************************************************/
192
/*                          CPLRecodeToWChar()                          */
193
/************************************************************************/
194
195
/**
196
 * Convert UTF-8 string to a wchar_t string.
197
 *
198
 * Convert an 8-bit, multi-byte per character input string into a wide
199
 * character (wchar_t) string.  The only guaranteed supported source encodings
200
 * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1).  The only
201
 * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
202
 * and destination encodings may be supported depending on the underlying
203
 * implementation.
204
 *
205
 * Note that the wchar_t type varies in size on different systems. On
206
 * win32 it is normally 2 bytes, and on UNIX 4 bytes.
207
 *
208
 * If an error occurs an error may, or may not be posted with CPLError().
209
 *
210
 * @param pszSource input multi-byte character string.
211
 * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
212
 * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
213
 *
214
 * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
215
 * NULL on error.
216
 *
217
 */
218
219
wchar_t CPL_DLL *CPLRecodeToWChar(const char *pszSource,
                                  const char *pszSrcEncoding,
                                  const char *pszDstEncoding)
{
#ifdef CPL_RECODE_ICONV
    // Conversions to CPL_ENC_UCS2 (or "WCHAR_T") from CPL_ENC_UTF8,
    // CPL_ENC_ASCII and CPL_ENC_ISO8859_1 are well handled by the stub
    // implementation. Any other pair is delegated to iconv().
    const bool bWideTarget = EQUAL(pszDstEncoding, CPL_ENC_UCS2) ||
                             EQUAL(pszDstEncoding, "WCHAR_T");
    const bool bStubHandlesIt =
        bWideTarget && (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) ||
                        EQUAL(pszSrcEncoding, CPL_ENC_ASCII) ||
                        EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1));
    if (!bStubHandlesIt)
    {
        return CPLRecodeToWCharIconv(pszSource, pszSrcEncoding,
                                     pszDstEncoding);
    }
#endif  // CPL_RECODE_ICONV

    // Reached when iconv support is not compiled in, or for the pairs the
    // stub is preferred for.
    return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding);
}
245
246
/************************************************************************/
247
/*                               CPLIsASCII()                           */
248
/************************************************************************/
249
250
/**
251
 * Test if a string is encoded as ASCII.
252
 *
253
 * @param pabyData input string to test
254
 * @param nLen length of the input string, or -1 if the function must compute
255
 *             the string length. In which case it must be null terminated.
256
 * @return true if the string is encoded as ASCII. false otherwise
257
 *
258
 * @since GDAL 3.6.0
259
 */
260
bool CPLIsASCII(const char *pabyData, size_t nLen)
{
    // A length of (size_t)-1 means the caller wants the length computed
    // from the terminating NUL.
    const size_t nByteCount =
        (nLen == static_cast<size_t>(-1)) ? strlen(pabyData) : nLen;

    // Walk the buffer as unsigned bytes; any byte with the high bit set
    // (value above 127) is outside the ASCII range.
    const unsigned char *pabyCur =
        reinterpret_cast<const unsigned char *>(pabyData);
    const unsigned char *const pabyEnd = pabyCur + nByteCount;
    while (pabyCur != pabyEnd)
    {
        if ((*pabyCur & 0x80) != 0)
        {
            return false;
        }
        ++pabyCur;
    }
    return true;
}
271
272
/************************************************************************/
273
/*                          CPLForceToASCII()                           */
274
/************************************************************************/
275
276
/**
277
 * Return a new string that is made only of ASCII characters. If non-ASCII
278
 * characters are found in the input string, they will be replaced by the
279
 * provided replacement character.
280
 *
281
 * This function does not make any assumption on the encoding of the input
282
 * string (except it must be nul-terminated if nLen equals -1, or have at
283
 * least nLen bytes otherwise). CPLUTF8ForceToASCII() can be used instead when
284
 * the input string is known to be UTF-8 encoded.
285
 *
286
 * @param pabyData input string to convert
287
 * @param nLen length of the input string, or -1 if the function must compute
288
 *             the string length. In which case it must be null terminated.
289
290
 * @param chReplacementChar character which will be used when the input stream
291
 *                          contains a non ASCII character. Must be valid ASCII!
292
 *
293
 * @return a new string that must be freed with CPLFree().
294
 *
295
 */
296
char *CPLForceToASCII(const char *pabyData, int nLen, char chReplacementChar)
297
0
{
298
0
    const size_t nRealLen =
299
0
        (nLen >= 0) ? static_cast<size_t>(nLen) : strlen(pabyData);
300
0
    char *pszOutputString = static_cast<char *>(CPLMalloc(nRealLen + 1));
301
0
    const char *pszPtr = pabyData;
302
0
    const char *pszEnd = pabyData + nRealLen;
303
0
    size_t i = 0;
304
0
    while (pszPtr != pszEnd)
305
0
    {
306
0
        if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127)
307
0
        {
308
0
            pszOutputString[i] = chReplacementChar;
309
0
            ++pszPtr;
310
0
            ++i;
311
0
        }
312
0
        else
313
0
        {
314
0
            pszOutputString[i] = *pszPtr;
315
0
            ++pszPtr;
316
0
            ++i;
317
0
        }
318
0
    }
319
0
    pszOutputString[i] = '\0';
320
0
    return pszOutputString;
321
0
}
322
323
/************************************************************************/
324
/*                       CPLUTF8ForceToASCII()                          */
325
/************************************************************************/
326
327
/**
328
 * Return a new string that is made only of ASCII characters. If non-ASCII
329
 * characters are found in the input string, for which an "equivalent" ASCII
330
 * character is not found, they will be replaced by the provided replacement
331
 * character.
332
 *
333
 * This function is aware of https://en.wikipedia.org/wiki/Latin-1_Supplement
334
 * and https://en.wikipedia.org/wiki/Latin_Extended-A to provide sensible
335
 * replacements for accented characters.
336
337
 * @param pszStr NUL-terminated UTF-8 string.
338
 * @param chReplacementChar character which will be used when the input stream
339
 *                          contains a non ASCII character that cannot be
340
 *                          substituted with an equivalent ASCII character.
341
 *                          Must be valid ASCII!
342
 *
343
 * @return a new string that must be freed with CPLFree().
344
 *
345
 * @since GDAL 3.9
346
 */
347
char *CPLUTF8ForceToASCII(const char *pszStr, char chReplacementChar)
348
0
{
349
0
    static const struct
350
0
    {
351
0
        short nCodePoint;
352
0
        char chFirst;
353
0
        char chSecond;
354
0
    } aLatinCharacters[] = {
355
        // https://en.wikipedia.org/wiki/Latin-1_Supplement
356
0
        {0xC0, 'A', 0},    // Latin Capital Letter A with grave
357
0
        {0xC1, 'A', 0},    // Latin Capital letter A with acute
358
0
        {0xC2, 'A', 0},    // Latin Capital letter A with circumflex
359
0
        {0xC3, 'A', 0},    // Latin Capital letter A with tilde
360
0
        {0xC4, 'A', 0},    // Latin Capital letter A with diaeresis
361
0
        {0xC5, 'A', 0},    // Latin Capital letter A with ring above
362
0
        {0xC6, 'A', 'E'},  // Latin Capital letter AE
363
0
        {0xC7, 'C', 0},    // Latin Capital letter C with cedilla
364
0
        {0xC8, 'E', 0},    // Latin Capital letter E with grave
365
0
        {0xC9, 'E', 0},    // Latin Capital letter E with acute
366
0
        {0xCA, 'E', 0},    // Latin Capital letter E with circumflex
367
0
        {0xCB, 'E', 0},    // Latin Capital letter E with diaeresis
368
0
        {0xCC, 'I', 0},    // Latin Capital letter I with grave
369
0
        {0xCD, 'I', 0},    // Latin Capital letter I with acute
370
0
        {0xCE, 'I', 0},    // Latin Capital letter I with circumflex
371
0
        {0xCF, 'I', 0},    // Latin Capital letter I with diaeresis
372
        // { 0xD0, '?', 0 }, // Latin Capital letter Eth
373
0
        {0xD1, 'N', 0},  // Latin Capital letter N with tilde
374
0
        {0xD2, 'O', 0},  // Latin Capital letter O with grave
375
0
        {0xD3, 'O', 0},  // Latin Capital letter O with acute
376
0
        {0xD4, 'O', 0},  // Latin Capital letter O with circumflex
377
0
        {0xD5, 'O', 0},  // Latin Capital letter O with tilde
378
0
        {0xD6, 'O', 0},  // Latin Capital letter O with diaeresis
379
0
        {0xD8, 'O', 0},  // Latin Capital letter O with stroke
380
0
        {0xD9, 'U', 0},  // Latin Capital letter U with grave
381
0
        {0xDA, 'U', 0},  // Latin Capital letter U with acute
382
0
        {0xDB, 'U', 0},  // Latin Capital Letter U with circumflex
383
0
        {0xDC, 'U', 0},  // Latin Capital Letter U with diaeresis
384
0
        {0xDD, 'Y', 0},  // Latin Capital Letter Y with acute
385
        // { 0xDE, '?', 0 }, // Latin Capital Letter Thorn
386
0
        {0xDF, 'S', 'S'},  // Latin Small Letter sharp S
387
0
        {0xE0, 'a', 0},    // Latin Small Letter A with grave
388
0
        {0xE1, 'a', 0},    // Latin Small Letter A with acute
389
0
        {0xE2, 'a', 0},    // Latin Small Letter A with circumflex
390
0
        {0xE3, 'a', 0},    // Latin Small Letter A with tilde
391
0
        {0xE4, 'a', 0},    // Latin Small Letter A with diaeresis
392
0
        {0xE5, 'a', 0},    // Latin Small Letter A with ring above
393
0
        {0xE6, 'a', 'e'},  // Latin Small Letter AE
394
0
        {0xE7, 'c', 0},    // Latin Small Letter C with cedilla
395
0
        {0xE8, 'e', 0},    // Latin Small Letter E with grave
396
0
        {0xE9, 'e', 0},    // Latin Small Letter E with acute
397
0
        {0xEA, 'e', 0},    // Latin Small Letter E with circumflex
398
0
        {0xEB, 'e', 0},    // Latin Small Letter E with diaeresis
399
0
        {0xEC, 'i', 0},    // Latin Small Letter I with grave
400
0
        {0xED, 'i', 0},    // Latin Small Letter I with acute
401
0
        {0xEE, 'i', 0},    // Latin Small Letter I with circumflex
402
0
        {0xEF, 'i', 0},    // Latin Small Letter I with diaeresis
403
        // { 0xF0, '?', 0 }, // Latin Small Letter Eth
404
0
        {0xF1, 'n', 0},  // Latin Small Letter N with tilde
405
0
        {0xF2, 'o', 0},  // Latin Small Letter O with grave
406
0
        {0xF3, 'o', 0},  // Latin Small Letter O with acute
407
0
        {0xF4, 'o', 0},  // Latin Small Letter O with circumflex
408
0
        {0xF5, 'o', 0},  // Latin Small Letter O with tilde
409
0
        {0xF6, 'o', 0},  // Latin Small Letter O with diaeresis
410
0
        {0xF8, 'o', 0},  // Latin Small Letter O with stroke
411
0
        {0xF9, 'u', 0},  // Latin Small Letter U with grave
412
0
        {0xFA, 'u', 0},  // Latin Small Letter U with acute
413
0
        {0xFB, 'u', 0},  // Latin Small Letter U with circumflex
414
0
        {0xFC, 'u', 0},  // Latin Small Letter U with diaeresis
415
0
        {0xFD, 'y', 0},  // Latin Small Letter Y with acute
416
        // { 0xFE, '?', 0 }, // Latin Small Letter Thorn
417
0
        {0xFF, 'u', 0},  // Latin Small Letter Y with diaeresis
418
419
        // https://en.wikipedia.org/wiki/Latin_Extended-A
420
0
        {
421
0
            0x0100,
422
0
            'A',
423
0
            0,
424
0
        },  // Latin Capital letter A with macron
425
0
        {
426
0
            0x0101,
427
0
            'a',
428
0
            0,
429
0
        },  // Latin Small letter A with macron
430
0
        {
431
0
            0x0102,
432
0
            'A',
433
0
            0,
434
0
        },  // Latin Capital letter A with breve
435
0
        {
436
0
            0x0103,
437
0
            'a',
438
0
            0,
439
0
        },  // Latin Small letter A with breve
440
0
        {
441
0
            0x0104,
442
0
            'A',
443
0
            0,
444
0
        },  // Latin Capital letter A with ogonek
445
0
        {
446
0
            0x0105,
447
0
            'a',
448
0
            0,
449
0
        },  // Latin Small letter A with ogonek
450
0
        {
451
0
            0x0106,
452
0
            'C',
453
0
            0,
454
0
        },  // Latin Capital letter C with acute
455
0
        {
456
0
            0x0107,
457
0
            'c',
458
0
            0,
459
0
        },  // Latin Small letter C with acute
460
0
        {
461
0
            0x0108,
462
0
            'C',
463
0
            0,
464
0
        },  // Latin Capital letter C with circumflex
465
0
        {
466
0
            0x0109,
467
0
            'c',
468
0
            0,
469
0
        },  // Latin Small letter C with circumflex
470
0
        {
471
0
            0x010A,
472
0
            'C',
473
0
            0,
474
0
        },  // Latin Capital letter C with dot above
475
0
        {
476
0
            0x010B,
477
0
            'c',
478
0
            0,
479
0
        },  // Latin Small letter C with dot above
480
0
        {
481
0
            0x010C,
482
0
            'C',
483
0
            0,
484
0
        },  // Latin Capital letter C with caron
485
0
        {
486
0
            0x010D,
487
0
            'c',
488
0
            0,
489
0
        },  // Latin Small letter C with caron
490
0
        {
491
0
            0x010E,
492
0
            'D',
493
0
            0,
494
0
        },  // Latin Capital letter D with caron
495
0
        {
496
0
            0x010F,
497
0
            'd',
498
0
            0,
499
0
        },  // Latin Small letter D with caron
500
0
        {
501
0
            0x0110,
502
0
            'D',
503
0
            0,
504
0
        },  // Latin Capital letter D with stroke
505
0
        {
506
0
            0x0111,
507
0
            'd',
508
0
            0,
509
0
        },  // Latin Small letter D with stroke
510
0
        {
511
0
            0x0112,
512
0
            'E',
513
0
            0,
514
0
        },  // Latin Capital letter E with macron
515
0
        {
516
0
            0x0113,
517
0
            'e',
518
0
            0,
519
0
        },  // Latin Small letter E with macron
520
0
        {
521
0
            0x0114,
522
0
            'E',
523
0
            0,
524
0
        },  // Latin Capital letter E with breve
525
0
        {
526
0
            0x0115,
527
0
            'e',
528
0
            0,
529
0
        },  // Latin Small letter E with breve
530
0
        {
531
0
            0x0116,
532
0
            'E',
533
0
            0,
534
0
        },  // Latin Capital letter E with dot above
535
0
        {
536
0
            0x0117,
537
0
            'e',
538
0
            0,
539
0
        },  // Latin Small letter E with dot above
540
0
        {
541
0
            0x0118,
542
0
            'E',
543
0
            0,
544
0
        },  // Latin Capital letter E with ogonek
545
0
        {
546
0
            0x0119,
547
0
            'e',
548
0
            0,
549
0
        },  // Latin Small letter E with ogonek
550
0
        {
551
0
            0x011A,
552
0
            'E',
553
0
            0,
554
0
        },  // Latin Capital letter E with caron
555
0
        {
556
0
            0x011B,
557
0
            'e',
558
0
            0,
559
0
        },  // Latin Small letter E with caron
560
0
        {
561
0
            0x011C,
562
0
            'G',
563
0
            0,
564
0
        },  // Latin Capital letter G with circumflex
565
0
        {
566
0
            0x011D,
567
0
            'g',
568
0
            0,
569
0
        },  // Latin Small letter G with circumflex
570
0
        {
571
0
            0x011E,
572
0
            'G',
573
0
            0,
574
0
        },  // Latin Capital letter G with breve
575
0
        {
576
0
            0x011F,
577
0
            'g',
578
0
            0,
579
0
        },  // Latin Small letter G with breve
580
0
        {
581
0
            0x0120,
582
0
            'G',
583
0
            0,
584
0
        },  // Latin Capital letter G with dot above
585
0
        {
586
0
            0x0121,
587
0
            'g',
588
0
            0,
589
0
        },  // Latin Small letter G with dot above
590
0
        {
591
0
            0x0122,
592
0
            'G',
593
0
            0,
594
0
        },  // Latin Capital letter G with cedilla
595
0
        {
596
0
            0x0123,
597
0
            'g',
598
0
            0,
599
0
        },  // Latin Small letter G with cedilla
600
0
        {
601
0
            0x0124,
602
0
            'H',
603
0
            0,
604
0
        },  // Latin Capital letter H with circumflex
605
0
        {
606
0
            0x0125,
607
0
            'h',
608
0
            0,
609
0
        },  // Latin Small letter H with circumflex
610
0
        {
611
0
            0x0126,
612
0
            'H',
613
0
            0,
614
0
        },  // Latin Capital letter H with stroke
615
0
        {
616
0
            0x0127,
617
0
            'h',
618
0
            0,
619
0
        },  // Latin Small letter H with stroke
620
0
        {
621
0
            0x0128,
622
0
            'I',
623
0
            0,
624
0
        },  // Latin Capital letter I with tilde
625
0
        {
626
0
            0x0129,
627
0
            'i',
628
0
            0,
629
0
        },  // Latin Small letter I with tilde
630
0
        {
631
0
            0x012A,
632
0
            'I',
633
0
            0,
634
0
        },  // Latin Capital letter I with macron
635
0
        {
636
0
            0x012B,
637
0
            'i',
638
0
            0,
639
0
        },  // Latin Small letter I with macron
640
0
        {
641
0
            0x012C,
642
0
            'I',
643
0
            0,
644
0
        },  // Latin Capital letter I with breve
645
0
        {
646
0
            0x012D,
647
0
            'i',
648
0
            0,
649
0
        },  // Latin Small letter I with breve
650
0
        {
651
0
            0x012E,
652
0
            'I',
653
0
            0,
654
0
        },  // Latin Capital letter I with ogonek
655
0
        {
656
0
            0x012F,
657
0
            'i',
658
0
            0,
659
0
        },  // Latin Small letter I with ogonek
660
0
        {
661
0
            0x0130,
662
0
            'I',
663
0
            0,
664
0
        },  // Latin Capital letter I with dot above
665
0
        {
666
0
            0x0131,
667
0
            'i',
668
0
            0,
669
0
        },  // Latin Small letter dotless I
670
0
        {
671
0
            0x0132,
672
0
            'I',
673
0
            'J',
674
0
        },  // Latin Capital Ligature IJ
675
0
        {
676
0
            0x0133,
677
0
            'i',
678
0
            'j',
679
0
        },  // Latin Small Ligature IJ
680
0
        {
681
0
            0x0134,
682
0
            'J',
683
0
            0,
684
0
        },  // Latin Capital letter J with circumflex
685
0
        {
686
0
            0x0135,
687
0
            'j',
688
0
            0,
689
0
        },  // Latin Small letter J with circumflex
690
0
        {
691
0
            0x0136,
692
0
            'K',
693
0
            0,
694
0
        },  // Latin Capital letter K with cedilla
695
0
        {
696
0
            0x0137,
697
0
            'k',
698
0
            0,
699
0
        },  // Latin Small letter K with cedilla
700
0
        {
701
0
            0x0138,
702
0
            'k',
703
0
            0,
704
0
        },  // Latin Small letter Kra
705
0
        {
706
0
            0x0139,
707
0
            'L',
708
0
            0,
709
0
        },  // Latin Capital letter L with acute
710
0
        {
711
0
            0x013A,
712
0
            'l',
713
0
            0,
714
0
        },  // Latin Small letter L with acute
715
0
        {
716
0
            0x013B,
717
0
            'L',
718
0
            0,
719
0
        },  // Latin Capital letter L with cedilla
720
0
        {
721
0
            0x013C,
722
0
            'l',
723
0
            0,
724
0
        },  // Latin Small letter L with cedilla
725
0
        {
726
0
            0x013D,
727
0
            'L',
728
0
            0,
729
0
        },  // Latin Capital letter L with caron
730
0
        {
731
0
            0x013E,
732
0
            'l',
733
0
            0,
734
0
        },  // Latin Small letter L with caron
735
0
        {
736
0
            0x013F,
737
0
            'L',
738
0
            0,
739
0
        },  // Latin Capital letter L with middle dot
740
0
        {
741
0
            0x0140,
742
0
            'l',
743
0
            0,
744
0
        },  // Latin Small letter L with middle dot
745
0
        {
746
0
            0x0141,
747
0
            'L',
748
0
            0,
749
0
        },  // Latin Capital letter L with stroke
750
0
        {
751
0
            0x0142,
752
0
            'l',
753
0
            0,
754
0
        },  // Latin Small letter L with stroke
755
0
        {
756
0
            0x0143,
757
0
            'N',
758
0
            0,
759
0
        },  // Latin Capital letter N with acute
760
0
        {
761
0
            0x0144,
762
0
            'n',
763
0
            0,
764
0
        },  // Latin Small letter N with acute
765
0
        {
766
0
            0x0145,
767
0
            'N',
768
0
            0,
769
0
        },  // Latin Capital letter N with cedilla
770
0
        {
771
0
            0x0146,
772
0
            'n',
773
0
            0,
774
0
        },  // Latin Small letter N with cedilla
775
0
        {
776
0
            0x0147,
777
0
            'N',
778
0
            0,
779
0
        },  // Latin Capital letter N with caron
780
0
        {
781
0
            0x0148,
782
0
            'n',
783
0
            0,
784
0
        },  // Latin Small letter N with caron
785
        // { 0x014A , '?' , 0, }, // Latin Capital letter Eng
786
        // { 0x014B , '?' , 0, }, // Latin Small letter Eng
787
0
        {
788
0
            0x014C,
789
0
            'O',
790
0
            0,
791
0
        },  // Latin Capital letter O with macron
792
0
        {
793
0
            0x014D,
794
0
            'o',
795
0
            0,
796
0
        },  // Latin Small letter O with macron
797
0
        {
798
0
            0x014E,
799
0
            'O',
800
0
            0,
801
0
        },  // Latin Capital letter O with breve
802
0
        {
803
0
            0x014F,
804
0
            'o',
805
0
            0,
806
0
        },  // Latin Small letter O with breve
807
0
        {
808
0
            0x0150,
809
0
            'O',
810
0
            0,
811
0
        },  // Latin Capital Letter O with double acute
812
0
        {
813
0
            0x0151,
814
0
            'o',
815
0
            0,
816
0
        },  // Latin Small Letter O with double acute
817
0
        {
818
0
            0x0152,
819
0
            'O',
820
0
            'E',
821
0
        },  // Latin Capital Ligature OE
822
0
        {
823
0
            0x0153,
824
0
            'o',
825
0
            'e',
826
0
        },  // Latin Small Ligature OE
827
0
        {
828
0
            0x0154,
829
0
            'R',
830
0
            0,
831
0
        },  // Latin Capital letter R with acute
832
0
        {
833
0
            0x0155,
834
0
            'r',
835
0
            0,
836
0
        },  // Latin Small letter R with acute
837
0
        {
838
0
            0x0156,
839
0
            'R',
840
0
            0,
841
0
        },  // Latin Capital letter R with cedilla
842
0
        {
843
0
            0x0157,
844
0
            'r',
845
0
            0,
846
0
        },  // Latin Small letter R with cedilla
847
0
        {
848
0
            0x0158,
849
0
            'R',
850
0
            0,
851
0
        },  // Latin Capital letter R with caron
852
0
        {
853
0
            0x0159,
854
0
            'r',
855
0
            0,
856
0
        },  // Latin Small letter R with caron
857
0
        {
858
0
            0x015A,
859
0
            'S',
860
0
            0,
861
0
        },  // Latin Capital letter S with acute
862
0
        {
863
0
            0x015B,
864
0
            's',
865
0
            0,
866
0
        },  // Latin Small letter S with acute
867
0
        {
868
0
            0x015C,
869
0
            'S',
870
0
            0,
871
0
        },  // Latin Capital letter S with circumflex
872
0
        {
873
0
            0x015D,
874
0
            's',
875
0
            0,
876
0
        },  // Latin Small letter S with circumflex
877
0
        {
878
0
            0x015E,
879
0
            'S',
880
0
            0,
881
0
        },  // Latin Capital letter S with cedilla
882
0
        {
883
0
            0x015F,
884
0
            's',
885
0
            0,
886
0
        },  // Latin Small letter S with cedilla
887
0
        {
888
0
            0x0160,
889
0
            'S',
890
0
            0,
891
0
        },  // Latin Capital letter S with caron
892
0
        {
893
0
            0x0161,
894
0
            's',
895
0
            0,
896
0
        },  // Latin Small letter S with caron
897
0
        {
898
0
            0x0162,
899
0
            'T',
900
0
            0,
901
0
        },  // Latin Capital letter T with cedilla
902
0
        {
903
0
            0x0163,
904
0
            't',
905
0
            0,
906
0
        },  // Latin Small letter T with cedilla
907
0
        {
908
0
            0x0164,
909
0
            'T',
910
0
            0,
911
0
        },  // Latin Capital letter T with caron
912
0
        {
913
0
            0x0165,
914
0
            't',
915
0
            0,
916
0
        },  // Latin Small letter T with caron
917
0
        {
918
0
            0x0166,
919
0
            'T',
920
0
            0,
921
0
        },  // Latin Capital letter T with stroke
922
0
        {
923
0
            0x0167,
924
0
            't',
925
0
            0,
926
0
        },  // Latin Small letter T with stroke
927
0
        {
928
0
            0x0168,
929
0
            'U',
930
0
            0,
931
0
        },  // Latin Capital letter U with tilde
932
0
        {
933
0
            0x0169,
934
0
            'u',
935
0
            0,
936
0
        },  // Latin Small letter U with tilde
937
0
        {
938
0
            0x016A,
939
0
            'U',
940
0
            0,
941
0
        },  // Latin Capital letter U with macron
942
0
        {
943
0
            0x016B,
944
0
            'u',
945
0
            0,
946
0
        },  // Latin Small letter U with macron
947
0
        {
948
0
            0x016C,
949
0
            'U',
950
0
            0,
951
0
        },  // Latin Capital letter U with breve
952
0
        {
953
0
            0x016D,
954
0
            'u',
955
0
            0,
956
0
        },  // Latin Small letter U with breve
957
0
        {
958
0
            0x016E,
959
0
            'U',
960
0
            0,
961
0
        },  // Latin Capital letter U with ring above
962
0
        {
963
0
            0x016F,
964
0
            'u',
965
0
            0,
966
0
        },  // Latin Small letter U with ring above
967
0
        {
968
0
            0x0170,
969
0
            'U',
970
0
            0,
971
0
        },  // Latin Capital Letter U with double acute
972
0
        {
973
0
            0x0171,
974
0
            'u',
975
0
            0,
976
0
        },  // Latin Small Letter U with double acute
977
0
        {
978
0
            0x0172,
979
0
            'U',
980
0
            0,
981
0
        },  // Latin Capital letter U with ogonek
982
0
        {
983
0
            0x0173,
984
0
            'u',
985
0
            0,
986
0
        },  // Latin Small letter U with ogonek
987
0
        {
988
0
            0x0174,
989
0
            'W',
990
0
            0,
991
0
        },  // Latin Capital letter W with circumflex
992
0
        {
993
0
            0x0175,
994
0
            'w',
995
0
            0,
996
0
        },  // Latin Small letter W with circumflex
997
0
        {
998
0
            0x0176,
999
0
            'Y',
1000
0
            0,
1001
0
        },  // Latin Capital letter Y with circumflex
1002
0
        {
1003
0
            0x0177,
1004
0
            'y',
1005
0
            0,
1006
0
        },  // Latin Small letter Y with circumflex
1007
0
        {
1008
0
            0x0178,
1009
0
            'Y',
1010
0
            0,
1011
0
        },  // Latin Capital letter Y with diaeresis
1012
0
        {
1013
0
            0x0179,
1014
0
            'Z',
1015
0
            0,
1016
0
        },  // Latin Capital letter Z with acute
1017
0
        {
1018
0
            0x017A,
1019
0
            'z',
1020
0
            0,
1021
0
        },  // Latin Small letter Z with acute
1022
0
        {
1023
0
            0x017B,
1024
0
            'Z',
1025
0
            0,
1026
0
        },  // Latin Capital letter Z with dot above
1027
0
        {
1028
0
            0x017C,
1029
0
            'z',
1030
0
            0,
1031
0
        },  // Latin Small letter Z with dot above
1032
0
        {
1033
0
            0x017D,
1034
0
            'Z',
1035
0
            0,
1036
0
        },  // Latin Capital letter Z with caron
1037
0
        {
1038
0
            0x017E,
1039
0
            'z',
1040
0
            0,
1041
0
        },  // Latin Small letter Z with caron
1042
0
    };
1043
1044
0
    const size_t nLen = strlen(pszStr);
1045
0
    char *pszOutputString = static_cast<char *>(CPLMalloc(nLen + 1));
1046
0
    const char *pszPtr = pszStr;
1047
0
    const char *pszEnd = pszStr + nLen;
1048
0
    size_t i = 0;
1049
0
    while (pszPtr != pszEnd)
1050
0
    {
1051
0
        if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127)
1052
0
        {
1053
0
            utf8_int32_t codepoint;
1054
0
            if (pszPtr + utf8codepointcalcsize(
1055
0
                             reinterpret_cast<const utf8_int8_t *>(pszPtr)) >
1056
0
                pszEnd)
1057
0
                break;
1058
0
            auto pszNext = reinterpret_cast<const char *>(utf8codepoint(
1059
0
                reinterpret_cast<const utf8_int8_t *>(pszPtr), &codepoint));
1060
0
            char ch = chReplacementChar;
1061
0
            for (const auto &latin1char : aLatinCharacters)
1062
0
            {
1063
0
                if (codepoint == latin1char.nCodePoint)
1064
0
                {
1065
0
                    pszOutputString[i] = latin1char.chFirst;
1066
0
                    ++i;
1067
0
                    if (latin1char.chSecond)
1068
0
                    {
1069
0
                        pszOutputString[i] = latin1char.chSecond;
1070
0
                        ++i;
1071
0
                    }
1072
0
                    ch = 0;
1073
0
                    break;
1074
0
                }
1075
0
            }
1076
0
            if (ch)
1077
0
            {
1078
0
                pszOutputString[i] = ch;
1079
0
                ++i;
1080
0
            }
1081
0
            pszPtr = pszNext;
1082
0
        }
1083
0
        else
1084
0
        {
1085
0
            pszOutputString[i] = *pszPtr;
1086
0
            ++pszPtr;
1087
0
            ++i;
1088
0
        }
1089
0
    }
1090
0
    pszOutputString[i] = '\0';
1091
0
    return pszOutputString;
1092
0
}
1093
1094
/************************************************************************/
1095
/*                        CPLEncodingCharSize()                         */
1096
/************************************************************************/
1097
1098
/**
1099
 * Return bytes per character for encoding.
1100
 *
1101
 * This function returns the size in bytes of the smallest character
1102
 * in this encoding.  For fixed width encodings (ASCII, UCS-2, UCS-4) this
1103
 * is straight forward.  For encodings like UTF8 and UTF16 which represent
1104
 * some characters as a sequence of atomic character sizes the function
1105
 * still returns the atomic character size (1 for UTF8, 2 for UTF16).
1106
 *
1107
 * This function will return the correct value for well known encodings
1108
 * with corresponding CPL_ENC_ values.  It may not return the correct value
1109
 * for other encodings even if they are supported by the underlying iconv
1110
 * or windows transliteration services.  Hopefully it will improve over time.
1111
 *
1112
 * @param pszEncoding the name of the encoding.
1113
 *
1114
 * @return the size of a minimal character in bytes or -1 if the size is
1115
 * unknown.
1116
 */
1117
1118
int CPLEncodingCharSize(const char *pszEncoding)
1119
1120
0
{
1121
0
    if (EQUAL(pszEncoding, CPL_ENC_UTF8))
1122
0
        return 1;
1123
0
    else if (EQUAL(pszEncoding, CPL_ENC_UTF16) ||
1124
0
             EQUAL(pszEncoding, "UTF-16LE"))
1125
0
        return 2;
1126
0
    else if (EQUAL(pszEncoding, CPL_ENC_UCS2) || EQUAL(pszEncoding, "UCS-2LE"))
1127
0
        return 2;
1128
0
    else if (EQUAL(pszEncoding, CPL_ENC_UCS4))
1129
0
        return 4;
1130
0
    else if (EQUAL(pszEncoding, CPL_ENC_ASCII))
1131
0
        return 1;
1132
0
    else if (STARTS_WITH_CI(pszEncoding, "ISO-8859-"))
1133
0
        return 1;
1134
1135
0
    return -1;
1136
0
}
1137
1138
/************************************************************************/
1139
/*                    CPLClearRecodeWarningFlags()                      */
1140
/************************************************************************/
1141
1142
void CPLClearRecodeWarningFlags()
1143
0
{
1144
0
#ifdef CPL_RECODE_ICONV
1145
0
    CPLClearRecodeIconvWarningFlags();
1146
0
#endif
1147
0
    CPLClearRecodeStubWarningFlags();
1148
0
}
1149
1150
/************************************************************************/
1151
/*                         CPLStrlenUTF8()                              */
1152
/************************************************************************/
1153
1154
/**
1155
 * Return the number of UTF-8 characters of a nul-terminated string.
1156
 *
1157
 * This is different from strlen() which returns the number of bytes.
1158
 *
1159
 * @param pszUTF8Str a nul-terminated UTF-8 string
1160
 *
1161
 * @return the number of UTF-8 characters.
1162
 */
1163
1164
int CPLStrlenUTF8(const char *pszUTF8Str)
1165
0
{
1166
0
    int nCharacterCount = 0;
1167
0
    for (size_t i = 0; pszUTF8Str[i] != '\0'; ++i)
1168
0
    {
1169
0
        if ((pszUTF8Str[i] & 0xc0) != 0x80)
1170
0
        {
1171
0
            if (nCharacterCount == INT_MAX)
1172
0
            {
1173
0
                CPLError(CE_Failure, CPLE_AppDefined,
1174
0
                         "CPLStrlenUTF8(): nCharacterCount > INT_MAX. Use "
1175
0
                         "CPLStrlenUTF8Ex() instead");
1176
0
                break;
1177
0
            }
1178
0
            ++nCharacterCount;
1179
0
        }
1180
0
    }
1181
0
    return nCharacterCount;
1182
0
}
1183
1184
/************************************************************************/
1185
/*                         CPLStrlenUTF8Ex()                            */
1186
/************************************************************************/
1187
1188
/**
 * Count the UTF-8 characters of a nul-terminated string.
 *
 * Unlike strlen(), which counts bytes, this counts every byte that is not
 * a UTF-8 continuation byte (10xxxxxx), i.e. one per encoded character.
 * The count is returned as a size_t.
 *
 * @param pszUTF8Str a nul-terminated UTF-8 string
 *
 * @return the number of UTF-8 characters.
 */

size_t CPLStrlenUTF8Ex(const char *pszUTF8Str)
{
    size_t nCount = 0;
    for (const char *pszIter = pszUTF8Str; *pszIter != '\0'; ++pszIter)
    {
        // Only bytes that start a character (anything but 10xxxxxx) count.
        const int bIsLeadByte = (*pszIter & 0xc0) != 0x80;
        if (bIsLeadByte)
            nCount += 1;
    }
    return nCount;
}
1210
1211
/************************************************************************/
1212
/*                           CPLCanRecode()                             */
1213
/************************************************************************/
1214
1215
/**
1216
 * Checks if it is possible to recode a string from one encoding to another.
1217
 *
1218
 * @param pszTestStr a NULL terminated string.
1219
 * @param pszSrcEncoding the source encoding.
1220
 * @param pszDstEncoding the destination encoding.
1221
 *
1222
 * @return a TRUE if recode is possible.
1223
 *
1224
 * @since GDAL 3.1.0
1225
 */
1226
int CPLCanRecode(const char *pszTestStr, const char *pszSrcEncoding,
1227
                 const char *pszDstEncoding)
1228
0
{
1229
0
    CPLClearRecodeWarningFlags();
1230
0
    CPLErrorReset();
1231
1232
0
    CPLPushErrorHandler(CPLQuietErrorHandler);
1233
0
    char *pszRec(CPLRecode(pszTestStr, pszSrcEncoding, pszDstEncoding));
1234
0
    CPLPopErrorHandler();
1235
1236
0
    if (pszRec == nullptr)
1237
0
    {
1238
0
        return FALSE;
1239
0
    }
1240
1241
0
    CPLFree(pszRec);
1242
1243
0
    if (CPLGetLastErrorType() != 0)
1244
0
    {
1245
0
        return FALSE;
1246
0
    }
1247
1248
0
    return TRUE;
1249
0
}