Coverage Report

Created: 2025-11-16 09:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libreoffice/sax/source/expatwrap/xml2utf.cxx
Line
Count
Source
1
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
/*
3
 * This file is part of the LibreOffice project.
4
 *
5
 * This Source Code Form is subject to the terms of the Mozilla Public
6
 * License, v. 2.0. If a copy of the MPL was not distributed with this
7
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8
 *
9
 * This file incorporates work covered by the following license notice:
10
 *
11
 *   Licensed to the Apache Software Foundation (ASF) under one or more
12
 *   contributor license agreements. See the NOTICE file distributed
13
 *   with this work for additional information regarding copyright
14
 *   ownership. The ASF licenses this file to you under the Apache
15
 *   License, Version 2.0 (the "License"); you may not use this file
16
 *   except in compliance with the License. You may obtain a copy of
17
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18
 */
19
#include <string.h>
20
21
#include <algorithm>
22
23
#include <sal/types.h>
24
25
#include <rtl/textenc.h>
26
#include <rtl/tencinfo.h>
27
#include <com/sun/star/io/NotConnectedException.hpp>
28
#include <com/sun/star/io/XInputStream.hpp>
29
#include <xml2utf.hxx>
30
#include <memory>
31
32
33
using namespace ::com::sun::star::uno;
34
using namespace ::com::sun::star::io;
35
36
37
namespace sax_expatwrap {
38
39
/** Reads the next chunk from the input stream and prepares it for the parser.

    Until the first chunk has been delivered, incoming bytes are collected
    until isEncodingRecognizable() accepts them. Then scanForEncoding() is
    run, the converters are set up through initializeDecoding() and, after
    an optional conversion, removeEncoding() strips the encoding
    pseudo-attribute from the data.

    If the stream ends while bytes are still being collected, those bytes
    are delivered instead of being dropped, so that very short documents
    still reach the parser.

    @param seq         receives the bytes to hand on; converted if
                       converters are active, otherwise passed through
    @param nMaxToRead  number of bytes requested from the stream; raised to
                       at least 512 while the first chunk is pending
    @return            number of bytes in seq, 0 if nothing was read and
                       nothing was pending
    @throws NotConnectedException if no input stream is set
*/
sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
{
    if( ! m_in.is() ) {
        throw NotConnectedException();
    }
    if( ! m_bStarted ) {
        // it should be possible to find the encoding attribute
        // within the first 512 bytes == 128 chars in UCS-4
        nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead );
    }

    // bytes collected so far while the encoding could not be determined yet
    Sequence< sal_Int8 > seqStart;
    sal_Int32 nRead;
    while( true )
    {
        nRead = m_in->readSomeBytes( seq , nMaxToRead );

        if( nRead + seqStart.getLength() == 0 )
        {
            // nothing new and nothing pending
            break;
        }

        if( ! m_bStarted )
        {
            if( seqStart.hasElements() )
            {
                if( nRead )
                {
                    // prefix the new data with what we had so far
                    const sal_Int32 nNew = seq.getLength();
                    const sal_Int32 nOld = seqStart.getLength();
                    seq.realloc( nOld + nNew );

                    sal_Int8 *pData = seq.getArray();
                    memmove( pData + nOld , pData , nNew );
                    memcpy( pData , seqStart.getConstArray() , nOld );
                }
                else
                {
                    // The stream is at EOF and seq is empty: deliver the
                    // collected bytes instead of losing them.
                    seq = seqStart;
                }
            }

            // Only wait for more data while the stream can still deliver some.
            if( nRead && ! isEncodingRecognizable( seq ) )
            {
                // remember what we have so far and read more
                seqStart = seq;
                continue;
            }

            // autodetection with the first bytes
            if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) {
                initializeDecoding();
            }
            seqStart = Sequence < sal_Int8 > ();
        }

        // do the encoding
        if( m_pText2Unicode && m_pUnicode2Text &&
            m_pText2Unicode->canContinue() ) {

            Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
            seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
        }

        if( ! m_bStarted )
        {
            // it must now be ensured, that no encoding attribute exist anymore
            // ( otherwise the expat-Parser will crash )
            // This must be done after decoding !
            // ( e.g. Files decoded in ucs-4 cannot be read properly )
            m_bStarted = true;
            removeEncoding( seq );
        }
        nRead = seq.getLength();
        break;
    }
    return nRead;
}
116
117
void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
118
255k
{
119
255k
    const sal_Int8 *pSource = seq.getArray();
120
255k
    if (seq.getLength() < 5 || strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5))
121
55.9k
        return;
122
123
    // scan for encoding
124
199k
    OString str( reinterpret_cast<char const *>(pSource), seq.getLength() );
125
126
    // cut sequence to first line break
127
    // find first line break;
128
199k
    int nMax = str.indexOf( 10 );
129
199k
    if( nMax >= 0 )
130
192k
    {
131
192k
        str = str.copy( 0 , nMax );
132
192k
    }
133
134
199k
    int nFound = str.indexOf( " encoding" );
135
199k
    if( nFound < 0 )        return;
136
137
194k
    int nStop;
138
194k
    int nStart = str.indexOf( "\"" , nFound );
139
194k
    if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
140
168k
    {
141
168k
        nStart = str.indexOf( "'" , nFound );
142
168k
        nStop  = str.indexOf( "'" , nStart +1 );
143
168k
    }
144
26.2k
    else
145
26.2k
    {
146
26.2k
        nStop  = str.indexOf( "\"" , nStart +1);
147
26.2k
    }
148
149
194k
    if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
150
28.6k
    {
151
        // remove encoding tag from file
152
28.6k
        memmove(        &( seq.getArray()[nFound] ) ,
153
28.6k
                        &( seq.getArray()[nStop+1]) ,
154
28.6k
                        seq.getLength() - nStop -1);
155
28.6k
        seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
156
28.6k
    }
157
194k
}
158
159
// Checks, if enough data has been accumulated to recognize the encoding
160
bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
161
255k
{
162
255k
    const sal_Int8 *pSource = seq.getConstArray();
163
255k
    bool bCheckIfFirstClosingBracketExists = false;
164
165
255k
    if( seq.getLength() < 8 ) {
166
        // no recognition possible, when less than 8 bytes are available
167
156
        return false;
168
156
    }
169
170
255k
    if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 5 ) ) {
171
        // scan if the <?xml tag finishes within this buffer
172
195k
        bCheckIfFirstClosingBracketExists = true;
173
195k
    }
174
59.8k
    else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
175
41.2k
             ('?' == pSource[4] || '?' == pSource[6] ) )
176
535
    {
177
        // check for utf-16
178
535
        bCheckIfFirstClosingBracketExists = true;
179
535
    }
180
59.3k
    else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
181
16.2k
             ( '?' == pSource[5] || '?' == pSource[7] ) )
182
114
    {
183
        // check for
184
114
        bCheckIfFirstClosingBracketExists = true;
185
114
    }
186
187
255k
    if( bCheckIfFirstClosingBracketExists )
188
195k
    {
189
        // whole <?xml tag is valid
190
195k
        return std::find(seq.begin(), seq.end(), '>') != seq.end();
191
195k
    }
192
193
    // No <? tag in front, no need for a bigger buffer
194
59.2k
    return true;
195
255k
}
196
197
bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
198
254k
{
199
254k
    const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
200
254k
    bool bReturn = true;
201
202
254k
    if( seq.getLength() < 4 ) {
203
        // no recognition possible, when less than 4 bytes are available
204
0
        return false;
205
0
    }
206
207
    // first level : detect possible file formats
208
254k
    if (seq.getLength() >= 5 && !strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5)) {
209
        // scan for encoding
210
195k
        OString str( reinterpret_cast<const char *>(pSource), seq.getLength() );
211
212
        // cut sequence to first line break
213
        //find first line break;
214
195k
        int nMax = str.indexOf( 10 );
215
195k
        if( nMax >= 0 )
216
191k
        {
217
191k
            str = str.copy( 0 , nMax );
218
191k
        }
219
220
195k
        int nFound = str.indexOf( " encoding" );
221
195k
        if( nFound >= 0 ) {
222
190k
            int nStop;
223
190k
            int nStart = str.indexOf( "\"" , nFound );
224
190k
            if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
225
164k
            {
226
164k
                nStart = str.indexOf( "'" , nFound );
227
164k
                nStop  = str.indexOf( "'" , nStart +1 );
228
164k
            }
229
26.1k
            else
230
26.1k
            {
231
26.1k
                nStop  = str.indexOf( "\"" , nStart +1);
232
26.1k
            }
233
190k
            if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
234
28.6k
            {
235
                // encoding found finally
236
28.6k
                m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
237
28.6k
            }
238
190k
        }
239
195k
    }
240
59.7k
    else if( 0xFE == pSource[0] &&
241
544
             0xFF == pSource[1] ) {
242
        // UTF-16 big endian
243
        // conversion is done so that encoding information can be easily extracted
244
535
        m_sEncoding = "utf-16"_ostr;
245
535
    }
246
59.2k
    else if( 0xFF == pSource[0] &&
247
285
             0xFE == pSource[1] ) {
248
        // UTF-16 little endian
249
        // conversion is done so that encoding information can be easily extracted
250
275
        m_sEncoding = "utf-16"_ostr;
251
275
    }
252
58.9k
    else if( 0x00 == pSource[0] && 0x3c == pSource[1]  && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
253
        // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
254
        // The byte order mark is simply added
255
256
        // simply add the byte order mark !
257
210
        seq.realloc( seq.getLength() + 2 );
258
210
        memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
259
210
        reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFE;
260
210
        reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFF;
261
262
210
        m_sEncoding = "utf-16"_ostr;
263
210
    }
264
58.7k
    else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
265
        // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
266
        // The byte order mark is simply added
267
268
363
        seq.realloc( seq.getLength() + 2 );
269
363
        memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
270
363
        reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFF;
271
363
        reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFE;
272
273
363
        m_sEncoding = "utf-16"_ostr;
274
363
    }
275
58.4k
    else if( 0xEF == pSource[0] &&
276
12.5k
             0xBB == pSource[1] &&
277
12.5k
             0xBF == pSource[2] )
278
12.5k
    {
279
        // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
280
        // The BOM is removed.
281
12.5k
        memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
282
12.5k
        seq.realloc( seq.getLength() - 3 );
283
12.5k
        m_sEncoding = "utf-8"_ostr;
284
12.5k
    }
285
45.8k
    else if( 0x00 == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
286
        // UCS-4 big endian
287
6
        m_sEncoding = "ucs-4"_ostr;
288
6
    }
289
45.8k
    else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
290
        // UCS-4 little endian
291
9
        m_sEncoding = "ucs-4"_ostr;
292
9
    }
293
/* TODO: no need to test for the moment since we return sal_False like default case anyway
294
    else if( 0x4c == pSource[0] && 0x6f == pSource[1]  &&
295
             0xa7 == static_cast<unsigned char> (pSource[2]) &&
296
             0x94 == static_cast<unsigned char> (pSource[3]) ) {
297
        // EBCDIC
298
        bReturn = sal_False;   // must be extended
299
    }
300
*/
301
45.8k
    else {
302
        // other
303
        // UTF8 is directly recognized by the parser.
304
45.8k
        bReturn = false;
305
45.8k
    }
306
307
254k
    return bReturn;
308
254k
}
309
310
void XMLFile2UTFConverter::initializeDecoding()
311
209k
{
312
313
209k
    if( !m_sEncoding.isEmpty() )
314
42.6k
    {
315
42.6k
        rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
316
42.6k
        if( encoding != RTL_TEXTENCODING_UTF8 )
317
29.9k
        {
318
29.9k
            m_pText2Unicode = std::make_unique<Text2UnicodeConverter>( m_sEncoding );
319
29.9k
            m_pUnicode2Text = std::make_unique<Unicode2TextConverter>( RTL_TEXTENCODING_UTF8 );
320
29.9k
        }
321
42.6k
    }
322
209k
}
323
324
325
// Text2UnicodeConverter
326
327
328
/** Sets up a converter for the encoding named by sEncoding.

    If the name cannot be mapped to a known text encoding, no converter is
    created and the object is marked as unable to continue.

    @param sEncoding  MIME charset name of the source encoding
*/
Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
    : m_convText2Unicode(nullptr)
    , m_contextText2Unicode(nullptr)
{
    const rtl_TextEncoding eResolved = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
    if( RTL_TEXTENCODING_DONTKNOW != eResolved )
    {
        init( eResolved );
        return;
    }

    // unknown encoding: leave the handles null
    m_bInitialized = false;
    m_bCanContinue = false;
}
343
344
/** Releases the conversion context and the converter, if they were created.

    The handles only exist when init() has run, which m_bInitialized records.
*/
Text2UnicodeConverter::~Text2UnicodeConverter()
{
    if( m_bInitialized )
    {
        // The context belongs to the converter, so it is destroyed first.
        rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
        // The handle comes from rtl_createTextToUnicodeConverter() and has to
        // be released with the matching text-to-unicode destroy function.
        rtl_destroyTextToUnicodeConverter( m_convText2Unicode );
    }
}
352
353
/** Creates the converter and its context for the given encoding and marks
    the object as initialized and able to continue.

    @param encoding  source text encoding
*/
void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
{
    // the context is created for the converter, so the converter comes first
    m_convText2Unicode = rtl_createTextToUnicodeConverter( encoding );
    m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );

    m_bInitialized = true;
    m_bCanContinue = true;
}
361
362
363
/** Converts a chunk of encoded text to UTF-16 code units.

    Bytes left over from the previous call (stored in m_seqSource) are put
    in front of seqText before converting. If the converter reports that
    the source ended too early, the unconverted bytes are stored in
    m_seqSource for the next call.

    @param seqText  next chunk of encoded bytes
    @return         the converted characters
*/
Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
{
    const sal_Int32 nCarried = m_seqSource.getLength();
    const sal_Int32 nFresh = seqText.getLength();

    // the whole source size
    const sal_Int32 nSourceSize = nFresh + nCarried;
    Sequence<sal_Unicode> seqUnicode( nSourceSize );

    const sal_Int8 *pInput = seqText.getConstArray();
    std::unique_ptr<sal_Int8[]> pJoined;

    if( nCarried != 0 ) {
        // put old rest and new byte sequence into one array
        pJoined.reset( new sal_Int8[ nSourceSize ] );
        memcpy( pJoined.get() , m_seqSource.getConstArray() , nCarried );
        memcpy( pJoined.get() + nCarried , seqText.getConstArray() , nFresh );
        pInput = pJoined.get();

        // the rest has been consumed
        m_seqSource = Sequence< sal_Int8 >();
    }

    sal_Size nConsumed = 0;
    sal_Size nProduced = 0;
    sal_uInt32 uiInfo;

    for( ;; ) {
        sal_Size nStepBytes = 0;

        /* All invalid characters are transformed to the unicode undefined char */
        const sal_Size nStepChars = rtl_convertTextToUnicode(
                                    m_convText2Unicode,
                                    m_contextText2Unicode,
                                    reinterpret_cast<const char *>( pInput + nConsumed ),
                                    nSourceSize - nConsumed ,
                                    seqUnicode.getArray() + nProduced ,
                                    seqUnicode.getLength() - nProduced ,
                                    RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
                                    RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
                                    RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
                                    &uiInfo,
                                    &nStepBytes );
        nProduced += nStepChars;
        nConsumed += nStepBytes;

        if( ! ( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL ) ) {
            break;
        }

        // target buffer exhausted: double it and carry on where we stopped
        seqUnicode.realloc( seqUnicode.getLength() * 2 );
    }

    if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL ) {
        // save necessary bytes for next conversion
        const sal_Int32 nRest = static_cast<sal_Int32>( nSourceSize - nConsumed );
        m_seqSource.realloc( nRest );
        memcpy( m_seqSource.getArray() , pInput + nConsumed , nRest );
    }

    // set to correct unicode size
    seqUnicode.realloc( nProduced );

    return seqUnicode;
}
422
423
424
// Unicode2TextConverter
425
426
427
/** Creates the converter and its context for the given target encoding.

    @param encoding  text encoding to convert to
*/
Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
{
    // the context is created for the converter, so the converter comes first
    m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding );

    m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
}
432
433
434
/** Releases the conversion context and then the converter it belongs to. */
Unicode2TextConverter::~Unicode2TextConverter()
{
    // context first, it was created for the converter
    rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );

    rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
}
439
440
441
/** Converts UTF-16 code units to bytes in the target encoding.

    Code units left over from the previous call (stored in m_seqSource)
    are put in front of the new input. If the converter reports that the
    source ended too early, the unconverted code units are stored in
    m_seqSource for the next call.

    @param puSource     code units to convert
    @param nSourceSize  number of code units in puSource
    @return             the converted bytes
*/
Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
{
    std::unique_ptr<sal_Unicode[]> pJoined;

    const sal_Int32 nCarried = m_seqSource.getLength();
    if( nCarried != 0 ) {
        // For surrogates !
        // put old rest and new input into one array
        // In general when surrogates are used, they should be rarely
        // cut off between two convert()-calls. So this code is used
        // rarely and the extra copy is acceptable.
        pJoined.reset( new sal_Unicode[ nSourceSize + nCarried ] );
        std::copy_n( m_seqSource.getConstArray() , nCarried , pJoined.get() );
        std::copy_n( puSource , nSourceSize , pJoined.get() + nCarried );

        puSource = pJoined.get();
        nSourceSize += nCarried;

        m_seqSource = Sequence< sal_Unicode > ();
    }

    // take nSourceSize * 3 as preference
    // this is an upper boundary for converting to utf8,
    // which most often used as the target.
    sal_Int32 nCapacity = nSourceSize * 3;
    Sequence<sal_Int8> seqText( nCapacity );
    char *pOut = reinterpret_cast<char *>( seqText.getArray() );

    sal_Size nProduced = 0;
    sal_Size nConsumed = 0;
    sal_uInt32 uiInfo;

    for( ;; ) {
        sal_Size nStepChars;
        const sal_Size nStepBytes = rtl_convertUnicodeToText(
                                    m_convUnicode2Text,
                                    m_contextUnicode2Text,
                                    puSource + nConsumed ,
                                    nSourceSize - nConsumed ,
                                    pOut + nProduced ,
                                    nCapacity - nProduced ,
                                    RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
                                    RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
                                    &uiInfo,
                                    &nStepChars );
        nProduced += nStepBytes;
        nConsumed += nStepChars;

        if( ! ( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) ) {
            break;
        }

        // double array size and continue where the converter stopped
        nCapacity = nCapacity * 2;
        seqText.realloc( nCapacity );
        pOut = reinterpret_cast<char *>( seqText.getArray() );
    }

    // for surrogates
    if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
        const sal_Int32 nRest = static_cast<sal_Int32>( nSourceSize - nConsumed );
        m_seqSource.realloc( nRest );
        std::copy_n( puSource + nConsumed , nRest , m_seqSource.getArray() );
    }

    // cut the buffer down to the bytes actually written
    seqText.realloc( nProduced );

    return seqText;
}
516
517
}
518
519
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */