Coverage Report

Created: 2025-07-07 10:01

/src/libreoffice/sal/textenc/textenc.cxx
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
/*
3
 * This file is part of the LibreOffice project.
4
 *
5
 * This Source Code Form is subject to the terms of the Mozilla Public
6
 * License, v. 2.0. If a copy of the MPL was not distributed with this
7
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8
 *
9
 * This file incorporates work covered by the following license notice:
10
 *
11
 *   Licensed to the Apache Software Foundation (ASF) under one or more
12
 *   contributor license agreements. See the NOTICE file distributed
13
 *   with this work for additional information regarding copyright
14
 *   ownership. The ASF licenses this file to you under the Apache
15
 *   License, Version 2.0 (the "License"); you may not use this file
16
 *   except in compliance with the License. You may obtain a copy of
17
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18
 */
19
20
/* TODO! This file should not be called textenc.cxx, because it is not the
21
   implementation of rtl/textenc.h.  Rather, it should be called
22
   gettextencodingdata.cxx. */
23
24
#include <sal/config.h>
25
26
#include <cstdlib>
27
28
#include <osl/module.hxx>
29
#include <rtl/textenc.h>
30
#include <rtl/tencinfo.h>
31
#include <sal/log.hxx>
32
#include <sal/types.h>
33
34
#include "convertsimple.hxx"
35
#include "gettextencodingdata.hxx"
36
#include "tcvtutf8.hxx"
37
#include "tenchelp.hxx"
38
39
#define NOTABUNI_START 0xFF
40
#define NOTABUNI_END 0x00
41
42
#define NOTABCHAR_START 0xFFFF
43
#define NOTABCHAR_END 0x0000
44
45
#define SAME8090UNI_START 0x80
46
#define SAME8090UNI_END 0x9F
47
/* Identity table for SAME8090UNI_START..SAME8090UNI_END (0x80..0x9F):
   entry i holds the 16-bit value 0x0080 + i, 32 entries in total.
   Referenced by aImplISO88591ByteCvtData. */
sal_uInt16 const aImpl8090SameToUniTab[SAME8090UNI_END
48
                                                  - SAME8090UNI_START
49
                                                  + 1]
50
= { 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 */
51
    0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
52
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 */
53
    0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F };
54
55
#define SAME8090CHAR_START 0x0080
56
#define SAME8090CHAR_END 0x009F
57
/* Identity table for SAME8090CHAR_START..SAME8090CHAR_END (0x0080..0x009F):
   entry i holds the byte 0x80 + i, 32 entries in total.
   Referenced by aImplISO88591ByteCvtData. */
unsigned char const aImpl8090SameToCharTab[SAME8090CHAR_END
58
                                                  - SAME8090CHAR_START
59
                                                  + 1]
60
    = { 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x0080 */
61
        0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
62
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x0090 */
63
        0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F };
64
65
#define SAMEA0FFCHAR_START 0x00A0
66
#define SAMEA0FFCHAR_END 0x00FF
67
/* Identity table for SAMEA0FFCHAR_START..SAMEA0FFCHAR_END (0x00A0..0x00FF):
   entry i holds the byte 0xA0 + i, 96 entries in total.
   Referenced by both aImplMS1252ByteCvtData and aImplISO88591ByteCvtData. */
unsigned char const aImplA0FFSameToCharTab[SAMEA0FFCHAR_END
68
                                                  - SAMEA0FFCHAR_START
69
                                                  + 1]
70
    = { 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, /* 0x00A0 */
71
        0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
72
        0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, /* 0x00B0 */
73
        0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
74
        0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, /* 0x00C0 */
75
        0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
76
        0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, /* 0x00D0 */
77
        0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
78
        0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, /* 0x00E0 */
79
        0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
80
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, /* 0x00F0 */
81
        0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF };
82
83
/* ======================================================================= */
84
85
/* MS-1252 */
86
/* Windows Standard CharSet (ANSI) for Western Script */
87
/* 1-Byte, 0x00-0x7F ASCII without exception */
88
/* Convert-Tables: mappings/vendors/micsft/windows/cp1252.txt from 04/15/98 Version 2.01 */
89
/* Last-Changes from us: */
90
91
/* ----------------------------------------------------------------------- */
92
93
#define MS1252UNI_START                 0x80
94
#define MS1252UNI_END                   0xFF
95
/* MS-1252 byte -> 16-bit Unicode value for bytes 0x80..0xFF; the index is
   (byte - MS1252UNI_START).  Each 16-byte row is split over two lines
   (columns 0-7, then 8-F), which is why every row label appears twice.
   The entries for bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D are 0 --
   NOTE(review): presumably "no mapping"; confirm how the converter treats 0.
   Bytes 0xA0..0xFF map to the identical numeric value.
   Also referenced by aImplUSASCIIByteCvtData. */
sal_uInt16 const aImplMS1252ToUniTab[MS1252UNI_END - MS1252UNI_START + 1] =
96
{
97
/*       0       1       2       3       4       5       6       7 */
98
/*       8       9       A       B       C       D       E       F */
99
    0x20AC,      0, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, /* 0x80 */
100
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152,      0, 0x017D,      0, /* 0x80 */
101
         0, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, /* 0x90 */
102
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153,      0, 0x017E, 0x0178, /* 0x90 */
103
    0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, /* 0xA0 */
104
    0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, /* 0xA0 */
105
    0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, /* 0xB0 */
106
    0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, /* 0xB0 */
107
    0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, /* 0xC0 */
108
    0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, /* 0xC0 */
109
    0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, /* 0xD0 */
110
    0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, /* 0xD0 */
111
    0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, /* 0xE0 */
112
    0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, /* 0xE0 */
113
    0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, /* 0xF0 */
114
    0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF  /* 0xF0 */
115
};
116
117
/* ----------------------------------------------------------------------- */
118
119
#define MS1252TOCHARTABEX_COUNT         27
120
/* Exception entries for the Unicode -> MS-1252 direction.  Each row holds a
   16-bit Unicode value, the MS-1252 byte (0x80..0x9F) it maps to, and a
   trailing 0.  The 27 rows are exactly the inverse of the non-zero
   0x80..0x9F entries of aImplMS1252ToUniTab and are listed in ascending
   order of the Unicode value.
   NOTE(review): the member names of ImplUniCharTabData are not visible in
   this file, and whether the lookup code depends on the ascending order
   should be confirmed before reordering or inserting rows.  Keep
   MS1252TOCHARTABEX_COUNT in sync with the number of rows. */
ImplUniCharTabData const aImplMS1252ToCharTabEx[MS1252TOCHARTABEX_COUNT] =
121
{
122
  { 0x0152, 0x8C, 0 },
123
  { 0x0153, 0x9C, 0 },
124
  { 0x0160, 0x8A, 0 },
125
  { 0x0161, 0x9A, 0 },
126
  { 0x0178, 0x9F, 0 },
127
  { 0x017D, 0x8E, 0 },
128
  { 0x017E, 0x9E, 0 },
129
  { 0x0192, 0x83, 0 },
130
  { 0x02C6, 0x88, 0 },
131
  { 0x02DC, 0x98, 0 },
132
  { 0x2013, 0x96, 0 },
133
  { 0x2014, 0x97, 0 },
134
  { 0x2018, 0x91, 0 },
135
  { 0x2019, 0x92, 0 },
136
  { 0x201A, 0x82, 0 },
137
  { 0x201C, 0x93, 0 },
138
  { 0x201D, 0x94, 0 },
139
  { 0x201E, 0x84, 0 },
140
  { 0x2020, 0x86, 0 },
141
  { 0x2021, 0x87, 0 },
142
  { 0x2022, 0x95, 0 },
143
  { 0x2026, 0x85, 0 },
144
  { 0x2030, 0x89, 0 },
145
  { 0x2039, 0x8B, 0 },
146
  { 0x203A, 0x9B, 0 },
147
  { 0x20AC, 0x80, 0 },
148
  { 0x2122, 0x99, 0 },
149
};
150
151
/* ----------------------------------------------------------------------- */
152
153
/* Conversion tables for MS-1252, given as positional initializers: five
   table pointers, then four start/end pairs, then a count.
   NOTE(review): the field names of ImplByteConvertData are declared
   elsewhere.  By position, the four pairs appear to bound the first four
   tables in order and the final count belongs to the fifth (exception)
   table -- confirm against the struct declaration.  The nullptr tables are
   paired with NOTABUNI_* / NOTABCHAR_*, whose start is greater than their
   end. */
ImplByteConvertData const aImplMS1252ByteCvtData =
154
{
155
    aImplMS1252ToUniTab,
156
    nullptr,
157
    aImplA0FFSameToCharTab,
158
    nullptr,
159
    aImplMS1252ToCharTabEx,
160
    MS1252UNI_START, MS1252UNI_END,
161
    NOTABUNI_START, NOTABUNI_END,
162
    SAMEA0FFCHAR_START, SAMEA0FFCHAR_END,
163
    NOTABCHAR_START, NOTABCHAR_END,
164
    MS1252TOCHARTABEX_COUNT
165
};
166
167
/* ----------------------------------------------------------------------- */
168
169
/* Encoding descriptor for MS-1252 (returned by Impl_getTextEncodingData for
   RTL_TEXTENCODING_MS_1252).  The nested initializer holds the conversion
   data, the two sal::detail::textenc single-byte conversion functions and
   six nullptr slots; the same six positions carry the context
   create/destroy/reset functions in the UTF-8 descriptors.  It is followed
   by two name strings, four small integers and the info flags.
   NOTE(review): the member names of ImplTextEncodingData are not visible in
   this file; check the declaration before reordering any initializer. */
ImplTextEncodingData const aImplMS1252TextEncodingData
170
    = { { &aImplMS1252ByteCvtData,
171
          sal::detail::textenc::convertCharToUnicode,
172
          sal::detail::textenc::convertUnicodeToChar,
173
          nullptr,
174
          nullptr,
175
          nullptr,
176
          nullptr,
177
          nullptr,
178
          nullptr },
179
        "iso8859-1",
180
        "windows-1252",
181
        1,
182
        1,
183
        1,
184
        0,
185
        RTL_TEXTENCODING_INFO_ASCII | RTL_TEXTENCODING_INFO_MIME };
186
    /* WIN, SCRIPT_LATIN, pc code page 850 */
187
188
/* ======================================================================= */
189
190
/* ISO-8859-1 */
191
/* Unix Standard CharSet (Latin1) for Western Script */
192
/* 1-Byte, 0x00-0x7F ASCII without exception, 0x80-0x9F control character like in Unicode */
193
/* Convert-Tables: mappings/iso8859/8859-1.txt from 07/27/99 Version 1.0 (based on Unicode 3.0) */
194
/* Last-Changes from us: */
195
196
#define ISO88591UNI_START               0xA0
197
#define ISO88591UNI_END                 0xFF
198
/* ISO-8859-1 byte -> 16-bit Unicode value for bytes 0xA0..0xFF; the index is
   (byte - ISO88591UNI_START).  Every entry equals its byte value.  Each
   16-byte row is split over two lines (columns 0-7, then 8-F), which is why
   every row label appears twice. */
sal_uInt16 const aImplISO88591ToUniTab[ISO88591UNI_END - ISO88591UNI_START + 1] =
199
{
200
/*       0       1       2       3       4       5       6       7 */
201
/*       8       9       A       B       C       D       E       F */
202
    0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, /* 0xA0 */
203
    0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, /* 0xA0 */
204
    0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, /* 0xB0 */
205
    0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, /* 0xB0 */
206
    0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, /* 0xC0 */
207
    0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, /* 0xC0 */
208
    0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, /* 0xD0 */
209
    0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, /* 0xD0 */
210
    0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, /* 0xE0 */
211
    0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, /* 0xE0 */
212
    0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, /* 0xF0 */
213
    0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF  /* 0xF0 */
214
};
215
216
/* ----------------------------------------------------------------------- */
217
218
/* Conversion tables for ISO-8859-1, given as positional initializers: five
   table pointers, then four start/end pairs, then a count.  0xA0..0xFF is
   served by aImplISO88591ToUniTab / aImplA0FFSameToCharTab and 0x80..0x9F by
   the identity tables aImpl8090SameToUniTab / aImpl8090SameToCharTab.  There
   is no exception table (nullptr, count 0).
   NOTE(review): the field names of ImplByteConvertData are declared
   elsewhere; the pairing of each range with its table is read off the
   initializer order and should be confirmed against the declaration. */
ImplByteConvertData const aImplISO88591ByteCvtData =
219
{
220
    aImplISO88591ToUniTab,
221
    aImpl8090SameToUniTab,
222
    aImplA0FFSameToCharTab,
223
    aImpl8090SameToCharTab,
224
    nullptr,
225
    ISO88591UNI_START, ISO88591UNI_END,
226
    SAME8090UNI_START, SAME8090UNI_END,
227
    SAMEA0FFCHAR_START, SAMEA0FFCHAR_END,
228
    SAME8090CHAR_START, SAME8090CHAR_END,
229
    0
230
};
231
232
/* ----------------------------------------------------------------------- */
233
234
/* Encoding descriptor for ISO-8859-1 (returned by Impl_getTextEncodingData
   for RTL_TEXTENCODING_ISO_8859_1).  The nested initializer holds the
   conversion data, the two sal::detail::textenc single-byte conversion
   functions and six nullptr slots; it is followed by two name strings, four
   small integers and the info flags.
   NOTE(review): the member names of ImplTextEncodingData are not visible in
   this file; check the declaration before reordering any initializer. */
ImplTextEncodingData const aImplISO88591TextEncodingData
235
    = { { &aImplISO88591ByteCvtData,
236
          sal::detail::textenc::convertCharToUnicode,
237
          sal::detail::textenc::convertUnicodeToChar,
238
          nullptr,
239
          nullptr,
240
          nullptr,
241
          nullptr,
242
          nullptr,
243
          nullptr },
244
        "iso8859-1",
245
        "iso-8859-1",
246
        1,
247
        1,
248
        1,
249
        0,
250
        RTL_TEXTENCODING_INFO_ASCII | RTL_TEXTENCODING_INFO_MIME };
251
    /* SCRIPT_LATIN, pc code page 850 */
252
253
/* ======================================================================= */
254
255
/* US-ASCII */
256
/* 7-Bit ASCII */
257
/* 1-Byte, 0x00-0x7F ASCII without exception */
258
/* For the import we use ISO-8859-1 with MS extension (MS-1252), because */
259
/* when the 8-Bit is set, the chance, that this is an ISO-8859-1 character */
260
/* is the greatest. For the export all chars greater than 127 are not */
261
/* converted and are replaced by the replacement character. */
262
/* Last-Changes from us: */
263
264
/* ----------------------------------------------------------------------- */
265
266
/* Conversion tables for US-ASCII, given as positional initializers.  The
   first table reuses aImplMS1252ToUniTab with the MS1252UNI_* range, matching
   the section comment above (8-bit input is imported as MS-1252).  All other
   table pointers are nullptr and are paired with NOTABUNI_* / NOTABCHAR_*
   ranges, whose start is greater than their end; the exception count is 0.
   NOTE(review): the field names of ImplByteConvertData are declared
   elsewhere; confirm the position-to-field mapping there. */
ImplByteConvertData const aImplUSASCIIByteCvtData =
267
{
268
    aImplMS1252ToUniTab,
269
    nullptr,
270
    nullptr,
271
    nullptr,
272
    nullptr,
273
    MS1252UNI_START, MS1252UNI_END,
274
    NOTABUNI_START, NOTABUNI_END,
275
    NOTABCHAR_START, NOTABCHAR_END,
276
    NOTABCHAR_START, NOTABCHAR_END,
277
    0
278
};
279
280
/* ----------------------------------------------------------------------- */
281
282
/* Encoding descriptor for US-ASCII (returned by Impl_getTextEncodingData for
   RTL_TEXTENCODING_ASCII_US).  Same layout as the other single-byte
   descriptors in this file; the flags additionally include
   RTL_TEXTENCODING_INFO_7BIT.
   NOTE(review): the member names of ImplTextEncodingData are not visible in
   this file; check the declaration before reordering any initializer. */
ImplTextEncodingData const aImplUSASCIITextEncodingData
283
    = { { &aImplUSASCIIByteCvtData,
284
          sal::detail::textenc::convertCharToUnicode,
285
          sal::detail::textenc::convertUnicodeToChar,
286
          nullptr,
287
          nullptr,
288
          nullptr,
289
          nullptr,
290
          nullptr,
291
          nullptr },
292
        "iso8859-1",
293
        "us-ascii",
294
        1,
295
        1,
296
        1,
297
        0,
298
        RTL_TEXTENCODING_INFO_ASCII
299
            | RTL_TEXTENCODING_INFO_7BIT
300
            | RTL_TEXTENCODING_INFO_MIME };
301
    /* SCRIPT_LATIN, pc code page 437 */
302
303
/* Encoding descriptor for UTF-8 (returned by Impl_getTextEncodingData for
   RTL_TEXTENCODING_UTF8).  The first member is nullptr; the remaining
   members of the nested initializer are the UTF-8 conversion functions and
   the context create/destroy/reset functions for each direction.
   The second integer is 6 here but 3 in aImplJavaUtf8TextEncodingData --
   NOTE(review): presumably the maximum number of bytes per character;
   confirm against the ImplTextEncodingData declaration. */
ImplTextEncodingData const aImplUTF8TextEncodingData
304
    = { { nullptr,
305
          &ImplConvertUtf8ToUnicode,
306
          &ImplConvertUnicodeToUtf8,
307
          &ImplCreateUtf8ToUnicodeContext,
308
          &ImplDestroyUtf8ToUnicodeContext,
309
          &ImplResetUtf8ToUnicodeContext,
310
          &ImplCreateUnicodeToUtf8Context,
311
          &ImplDestroyUnicodeToUtf8Context,
312
          &ImplResetUnicodeToUtf8Context },
313
        "iso8859-1",
314
        "utf-8",
315
        1,
316
        6,
317
        1,
318
        0,
319
        RTL_TEXTENCODING_INFO_ASCII
320
            | RTL_TEXTENCODING_INFO_UNICODE
321
            | RTL_TEXTENCODING_INFO_MULTIBYTE
322
            | RTL_TEXTENCODING_INFO_MIME };
323
    /* SCRIPT_UNICODE, pc code page 850 */
324
325
static char aImplJavaUtf8TextConverterTag;
326
    /* The value of this tag is irrelevant.  Only its address != NULL is used to
327
       distinguish between RTL_TEXTENCODING_UTF8 and
328
       RTL_TEXTENCODING_JAVA_UTF8. */
329
330
/* Encoding descriptor for Java UTF-8 (returned by Impl_getTextEncodingData
   for RTL_TEXTENCODING_JAVA_UTF8).  It uses the same conversion and context
   functions as aImplUTF8TextEncodingData; the only difference in the nested
   initializer is the first member, which points at
   aImplJavaUtf8TextConverterTag instead of being nullptr.  Both name strings
   are nullptr and the flags are UNICODE | MULTIBYTE only (no ASCII or MIME
   flag). */
ImplTextEncodingData const aImplJavaUtf8TextEncodingData
331
    = { { &aImplJavaUtf8TextConverterTag,
332
          &ImplConvertUtf8ToUnicode,
333
          &ImplConvertUnicodeToUtf8,
334
          &ImplCreateUtf8ToUnicodeContext,
335
          &ImplDestroyUtf8ToUnicodeContext,
336
          &ImplResetUtf8ToUnicodeContext,
337
          &ImplCreateUnicodeToUtf8Context,
338
          &ImplDestroyUnicodeToUtf8Context,
339
          &ImplResetUnicodeToUtf8Context },
340
        nullptr,
341
        nullptr,
342
        1,
343
        3,
344
        1,
345
        0,
346
        RTL_TEXTENCODING_INFO_UNICODE | RTL_TEXTENCODING_INFO_MULTIBYTE };
347
348
namespace {
349
350
#ifndef COND_LIB_SAL_TEXTENC
351
352
extern "C" ImplTextEncodingData const * sal_getFullTextEncodingData(
353
    rtl_TextEncoding); // from tables.cxx in sal_textenc library
354
355
/* Variant compiled when COND_LIB_SAL_TEXTENC is not defined: get() forwards
   straight to the extern "C" function sal_getFullTextEncodingData declared
   above.  The class holds no state and is non-copyable; it offers the same
   get(encoding) call as the dynamically loading variant in the #else
   branch. */
class FullTextEncodingData {
356
public:
357
10.9M
    /* Returns whatever sal_getFullTextEncodingData returns for encoding. */
    ImplTextEncodingData const * get(rtl_TextEncoding encoding) {
358
10.9M
        /* The otherwise unused reference to this is tied to the loplugin
           named in the trailing comment -- NOTE(review): presumably it keeps
           the plugin from asking for a static member function; confirm. */
        (void) this; // loplugin:staticmethods
359
10.9M
        return sal_getFullTextEncodingData(encoding);
360
10.9M
    }
361
    FullTextEncodingData() = default;
362
    FullTextEncodingData(const FullTextEncodingData&) = delete;
363
    FullTextEncodingData& operator=(const FullTextEncodingData&) = delete;
364
};
365
366
#else
367
368
extern "C" {
369
370
typedef ImplTextEncodingData const * TextEncodingFunction(rtl_TextEncoding);
371
372
void thisModule() {}
373
374
}
375
376
/* Variant compiled when COND_LIB_SAL_TEXTENC is defined: the constructor
   loads the module named SAL_MODULENAME("sal_textenclo") relative to
   thisModule and looks up the symbol sal_getFullTextEncodingData in it.
   If either step fails, a warning is logged and std::abort() is called, so
   a constructed object always has a non-null function_.  get() calls
   through that pointer.  The class is non-copyable. */
class FullTextEncodingData {
377
public:
378
    FullTextEncodingData() {
379
        if (!module_.loadRelative(&thisModule, SAL_MODULENAME("sal_textenclo")))
380
        {
381
            SAL_WARN( "sal.textenc", "Loading sal_textenc library failed" );
382
            std::abort();
383
        }
384
        /* The looked-up symbol is cast to the expected function type;
           nothing here verifies that the signature actually matches. */
        function_ = reinterpret_cast< TextEncodingFunction * >(
385
            module_.getFunctionSymbol("sal_getFullTextEncodingData"));
386
        if (function_ == nullptr) {
387
            SAL_WARN( "sal.textenc", "Obtaining sal_getFullTextEncodingData function from sal_textenc"
388
                " library failed");
389
            std::abort();
390
        }
391
    }
392
393
    /* Returns whatever the loaded sal_getFullTextEncodingData returns for
       encoding. */
    ImplTextEncodingData const * get(rtl_TextEncoding encoding) const {
394
        return (*function_)(encoding);
395
    }
396
397
    FullTextEncodingData(const FullTextEncodingData&) = delete;
398
    FullTextEncodingData& operator=(const FullTextEncodingData&) = delete;
399
400
private:
401
    /* NOTE(review): presumably kept as a member so the library stays loaded
       for as long as function_ may be called -- confirm osl::Module's unload
       behaviour. */
    osl::Module module_;
402
    /* Assigned in the constructor body; never null once construction has
       completed. */
    TextEncodingFunction * function_;
403
};
404
405
#endif
406
407
}
408
409
/* Returns the encoding descriptor for nEncoding.
   The five encodings whose descriptors are defined in this file (US-ASCII,
   MS-1252, UTF-8, Java UTF-8, ISO-8859-1) are answered directly with the
   address of the corresponding static object.  Every other value is passed
   to FullTextEncodingData::get on a function-local static instance.
   NOTE(review): whether that call can yield a null pointer for an unknown
   encoding is not visible in this file; callers should not assume a
   non-null result without checking the sal_textenc side.
   The break after each return is unreachable. */
ImplTextEncodingData const *
410
Impl_getTextEncodingData(rtl_TextEncoding nEncoding)
411
125M
{
412
125M
    switch(nEncoding)
413
125M
    {
414
68.4M
        case RTL_TEXTENCODING_ASCII_US:
415
68.4M
            return &aImplUSASCIITextEncodingData; break;
416
38.9M
        case RTL_TEXTENCODING_MS_1252:
417
38.9M
            return &aImplMS1252TextEncodingData; break;
418
2.79M
        case RTL_TEXTENCODING_UTF8:
419
2.79M
            return &aImplUTF8TextEncodingData; break;
420
14
        case RTL_TEXTENCODING_JAVA_UTF8:
421
14
            return &aImplJavaUtf8TextEncodingData; break;
422
4.76M
        case RTL_TEXTENCODING_ISO_8859_1:
423
4.76M
            return &aImplISO88591TextEncodingData; break;
424
10.9M
        default:
425
10.9M
        {
426
10.9M
            /* Constructed the first time control reaches this declaration,
               i.e. on the first request for an encoding not handled above. */
            static FullTextEncodingData gFullTextEncodingData;
427
10.9M
            return gFullTextEncodingData.get(nEncoding);
428
0
        }
429
125M
    }
430
125M
}
431
432
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */