/src/libreoffice/sax/source/expatwrap/xml2utf.cxx

Source
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
#include <string.h>

#include <algorithm>

#include <sal/types.h>

#include <rtl/textenc.h>
#include <rtl/tencinfo.h>
#include <com/sun/star/io/NotConnectedException.hpp>
#include <com/sun/star/io/XInputStream.hpp>
#include <xml2utf.hxx>
#include <memory>


using namespace ::com::sun::star::uno;
using namespace ::com::sun::star::io;


namespace sax_expatwrap {

/**
 * Reads the next chunk from the wrapped input stream and returns it in seq,
 * converted to UTF-8 when a converter pair has been set up.
 *
 * Until the first chunk has been delivered (m_bStarted is false) the bytes
 * read so far are accumulated until isEncodingRecognizable() accepts them.
 * Then scanForEncoding() / initializeDecoding() set up the converters, the
 * data is converted, and removeEncoding() strips the encoding
 * pseudo-attribute from the converted declaration.
 *
 * If the stream ends while bytes are still being accumulated, those bytes
 * are delivered as they are instead of being dropped, so that very short
 * documents (fewer than 8 bytes) or documents with an unterminated
 * declaration still reach the parser.
 *
 * @param seq         receives the bytes for the parser; its previous
 *                    content is replaced
 * @param nMaxToRead  upper bound handed to readSomeBytes(); raised to at
 *                    least 512 while the first chunk is still pending
 * @return the number of bytes now held in seq, 0 when nothing was read and
 *         nothing was buffered
 * @throws NotConnectedException when no input stream has been set
 */
sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
{
    if( ! m_in.is() ) {
        throw NotConnectedException();
    }
    if( ! m_bStarted ) {
        // it should be possible to find the encoding attribute
        // within the first 512 bytes == 128 chars in UCS-4
        nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead );
    }

    sal_Int32 nRead;
    // bytes that were read but were not yet sufficient for the detection
    Sequence< sal_Int8 > seqStart;
    while( true )
    {
        nRead = m_in->readSomeBytes( seq , nMaxToRead );

        if( nRead + seqStart.getLength())
        {
            if( ! m_bStarted )
            {
                if( nRead == 0 )
                {
                    // nRead == 0 means the stream is at its end, so no more
                    // data will arrive. The outer condition guarantees that
                    // seqStart is non-empty here: hand those bytes back
                    // rather than losing them.
                    seq = seqStart;
                }
                else if( seqStart.hasElements() )
                {
                    // prefix the fresh bytes with what we had so far.
                    sal_Int32 nLength = seq.getLength();
                    seq.realloc( seqStart.getLength() + nLength );

                    // shift the fresh bytes back first (ranges overlap),
                    // then copy the remembered bytes in front of them
                    memmove (seq.getArray() + seqStart.getLength(),
                             seq.getConstArray(),
                             nLength);
                    memcpy  (seq.getArray(),
                             seqStart.getConstArray(),
                             seqStart.getLength());
                }

                // autodetection with the first bytes; asking for more data
                // only makes sense while the stream still delivers some
                if( nRead && ! isEncodingRecognizable( seq ) )
                {
                    // remember what we have so far.
                    seqStart = seq;

                    // read more !
                    continue;
                }
                if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) {
                    // initialize decoding
                    initializeDecoding();
                }
                seqStart = Sequence < sal_Int8 > ();
            }

            // do the encoding
            if( m_pText2Unicode && m_pUnicode2Text &&
                m_pText2Unicode->canContinue() ) {

                Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
                seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
            }

            if( ! m_bStarted )
            {
                // it must now be ensured, that no encoding attribute exist anymore
                // ( otherwise the expat-Parser will crash )
                // This must be done after decoding !
                // ( e.g. Files decoded in ucs-4 cannot be read properly )
                m_bStarted = true;
                removeEncoding( seq );
            }
            nRead = seq.getLength();
        }

        break;
    }
    return nRead;
}

void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
{
    const sal_Int8 *pSource = seq.getArray();
    if (seq.getLength() < 5 || strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5))
        return;

    // scan for encoding
    OString str( reinterpret_cast<char const *>(pSource), seq.getLength() );

    // cut sequence to first line break
    // find first line break;
    int nMax = str.indexOf( 10 );
    if( nMax >= 0 )
    {
        str = str.copy( 0 , nMax );
    }

    int nFound = str.indexOf( " encoding" );
    if( nFound < 0 )        return;

    int nStop;
    int nStart = str.indexOf( "\"" , nFound );
    if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
    {
        nStart = str.indexOf( "'" , nFound );
        nStop  = str.indexOf( "'" , nStart +1 );
    }
    else
    {
        nStop  = str.indexOf( "\"" , nStart +1);
    }

    if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
    {
        // remove encoding tag from file
        memmove(        &( seq.getArray()[nFound] ) ,
                        &( seq.getArray()[nStop+1]) ,
                        seq.getLength() - nStop -1);
        seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
    }
}

// Checks, if enough data has been accumulated to recognize the encoding
bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
{
    const sal_Int8 *pSource = seq.getConstArray();
    bool bCheckIfFirstClosingBracketExists = false;

    if( seq.getLength() < 8 ) {
        // no recognition possible, when less than 8 bytes are available
        return false;
    }

    if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 5 ) ) {
        // scan if the <?xml tag finishes within this buffer
        bCheckIfFirstClosingBracketExists = true;
    }
    else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
             ('?' == pSource[4] || '?' == pSource[6] ) )
    {
        // check for utf-16
        bCheckIfFirstClosingBracketExists = true;
    }
    else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
             ( '?' == pSource[5] || '?' == pSource[7] ) )
    {
        // check for
        bCheckIfFirstClosingBracketExists = true;
    }

    if( bCheckIfFirstClosingBracketExists )
    {
        // whole <?xml tag is valid
        return std::find(seq.begin(), seq.end(), '>') != seq.end();
    }

    // No <? tag in front, no need for a bigger buffer
    return true;
}

bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
{
    const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
    bool bReturn = true;

    if( seq.getLength() < 4 ) {
        // no recognition possible, when less than 4 bytes are available
        return false;
    }

    // first level : detect possible file formats
    if (seq.getLength() >= 5 && !strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5)) {
        // scan for encoding
        OString str( reinterpret_cast<const char *>(pSource), seq.getLength() );

        // cut sequence to first line break
        //find first line break;
        int nMax = str.indexOf( 10 );
        if( nMax >= 0 )
        {
            str = str.copy( 0 , nMax );
        }

        int nFound = str.indexOf( " encoding" );
        if( nFound >= 0 ) {
            int nStop;
            int nStart = str.indexOf( "\"" , nFound );
            if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
            {
                nStart = str.indexOf( "'" , nFound );
                nStop  = str.indexOf( "'" , nStart +1 );
            }
            else
            {
                nStop  = str.indexOf( "\"" , nStart +1);
            }
            if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
            {
                // encoding found finally
                m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
            }
        }
    }
    else if( 0xFE == pSource[0] &&
             0xFF == pSource[1] ) {
        // UTF-16 big endian
        // conversion is done so that encoding information can be easily extracted
        m_sEncoding = "utf-16"_ostr;
    }
    else if( 0xFF == pSource[0] &&
             0xFE == pSource[1] ) {
        // UTF-16 little endian
        // conversion is done so that encoding information can be easily extracted
        m_sEncoding = "utf-16"_ostr;
    }
    else if( 0x00 == pSource[0] && 0x3c == pSource[1]  && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
        // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
        // The byte order mark is simply added

        // simply add the byte order mark !
        seq.realloc( seq.getLength() + 2 );
        memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
        reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFE;
        reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFF;

        m_sEncoding = "utf-16"_ostr;
    }
    else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
        // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
        // The byte order mark is simply added

        seq.realloc( seq.getLength() + 2 );
        memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
        reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFF;
        reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFE;

        m_sEncoding = "utf-16"_ostr;
    }
    else if( 0xEF == pSource[0] &&
             0xBB == pSource[1] &&
             0xBF == pSource[2] )
    {
        // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
        // The BOM is removed.
        memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
        seq.realloc( seq.getLength() - 3 );
        m_sEncoding = "utf-8"_ostr;
    }
    else if( 0x00 == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
        // UCS-4 big endian
        m_sEncoding = "ucs-4"_ostr;
    }
    else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
        // UCS-4 little endian
        m_sEncoding = "ucs-4"_ostr;
    }
/* TODO: no need to test for the moment since we return sal_False like default case anyway
    else if( 0x4c == pSource[0] && 0x6f == pSource[1]  &&
             0xa7 == static_cast<unsigned char> (pSource[2]) &&
             0x94 == static_cast<unsigned char> (pSource[3]) ) {
        // EBCDIC
        bReturn = sal_False;   // must be extended
    }
*/
    else {
        // other
        // UTF8 is directly recognized by the parser.
        bReturn = false;
    }

    return bReturn;
}

void XMLFile2UTFConverter::initializeDecoding()
{

    if( !m_sEncoding.isEmpty() )
    {
        rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
        if( encoding != RTL_TEXTENCODING_UTF8 )
        {
            m_pText2Unicode = std::make_unique<Text2UnicodeConverter>( m_sEncoding );
            m_pUnicode2Text = std::make_unique<Unicode2TextConverter>( RTL_TEXTENCODING_UTF8 );
        }
    }
}


// Text2UnicodeConverter


// Sets up a converter for the MIME charset name sEncoding. When the name is
// not known, the object stays uninitialized and canContinue() style checks
// on m_bCanContinue report that no conversion is possible.
Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
    : m_convText2Unicode(nullptr)
    , m_contextText2Unicode(nullptr)
{
    const rtl_TextEncoding eCharset = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
    if( eCharset != RTL_TEXTENCODING_DONTKNOW )
    {
        init( eCharset );
        return;
    }

    // unknown charset name: no converter handles are created
    m_bCanContinue = false;
    m_bInitialized = false;
}

// Releases the conversion context and the converter, provided init() has
// created them. The converter was obtained from
// rtl_createTextToUnicodeConverter(), so it is released with the matching
// rtl_destroyTextToUnicodeConverter() (not the Unicode-to-text variant).
Text2UnicodeConverter::~Text2UnicodeConverter()
{
    if( m_bInitialized )
    {
        // the context belongs to the converter and has to go first
        rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
        rtl_destroyTextToUnicodeConverter( m_convText2Unicode );
    }
}

// Creates the converter and its context for the given encoding and marks the
// object as initialized and usable.
void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
{
    m_convText2Unicode = rtl_createTextToUnicodeConverter( encoding );
    m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );

    m_bInitialized = true;
    m_bCanContinue = true;
}


// Converts a chunk of encoded text to UTF-16 code units.
//
// Bytes that could not be converted at the end of the previous call (the
// converter reported that the source buffer was too small) are kept in
// m_seqSource and are put in front of seqText before converting. Bytes left
// over at the end of this call are stored there again for the next call.
Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
{
    // the whole source size: leftover of the last call plus the new chunk
    const sal_Int32 nLeftover = m_seqSource.getLength();
    sal_Int32 nSourceSize = seqText.getLength() + nLeftover;

    // one code unit per source byte as the initial guess
    Sequence<sal_Unicode> seqUnicode( nSourceSize );

    const sal_Int8 *pInput = seqText.getConstArray();
    std::unique_ptr<sal_Int8[]> pJoined;
    if( nLeftover > 0 )
    {
        // put old rest and new byte sequence into one array
        pJoined.reset( new sal_Int8[ nSourceSize ] );
        memcpy( pJoined.get() , m_seqSource.getConstArray() , nLeftover );
        memcpy( pJoined.get() + nLeftover , seqText.getConstArray() , seqText.getLength() );
        pInput = pJoined.get();

        // the leftover has been consumed
        m_seqSource = Sequence< sal_Int8 >();
    }

    sal_uInt32 uiInfo;
    sal_Size nBytesDone = 0;
    sal_Size nUnitsDone = 0;
    bool bGrowAndRetry;
    do
    {
        sal_Size nStepBytes = 0;

        /* All invalid characters are transformed to the unicode undefined char */
        const sal_Size nStepUnits = rtl_convertTextToUnicode(
                                    m_convText2Unicode,
                                    m_contextText2Unicode,
                                    reinterpret_cast<const char *>( pInput + nBytesDone ),
                                    nSourceSize - nBytesDone ,
                                    seqUnicode.getArray() + nUnitsDone ,
                                    seqUnicode.getLength() - nUnitsDone,
                                    RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
                                    RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
                                    RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
                                    &uiInfo,
                                    &nStepBytes );
        nUnitsDone += nStepUnits;
        nBytesDone += nStepBytes;

        bGrowAndRetry = ( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL ) != 0;
        if( bGrowAndRetry )
        {
            // target too small: double it and continue where we stopped
            seqUnicode.realloc( seqUnicode.getLength() * 2 );
        }
    }
    while( bGrowAndRetry );

    if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL )
    {
        // save the unconverted tail for the next conversion
        const sal_Int8 *pTail = pInput + nBytesDone;
        m_seqSource.realloc( nSourceSize - nBytesDone );
        memcpy( m_seqSource.getArray() , pTail , nSourceSize - nBytesDone );
    }

    // set to correct unicode size
    seqUnicode.realloc( nUnitsDone );

    return seqUnicode;
}


// Unicode2TextConverter


// Creates the Unicode-to-text converter for the target encoding together
// with the conversion context that belongs to it.
Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
{
    auto const hConverter = rtl_createUnicodeToTextConverter( encoding );

    m_convUnicode2Text = hConverter;
    m_contextUnicode2Text = rtl_createUnicodeToTextContext( hConverter );
}


// Releases the conversion context first and then the converter it was
// created for.
Unicode2TextConverter::~Unicode2TextConverter()
{
    auto const hConverter = m_convUnicode2Text;

    rtl_destroyUnicodeToTextContext( hConverter , m_contextUnicode2Text );
    rtl_destroyUnicodeToTextConverter( hConverter );
}


// Converts nSourceSize UTF-16 code units starting at puSource to the target
// encoding of this converter.
//
// Code units that could not be converted at the end of the previous call
// (the converter reported that the source buffer was too small, e.g. a
// surrogate pair cut in half) are kept in m_seqSource and are put in front of
// the new input. Units left over at the end of this call are stored there
// again.
Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
{
    std::unique_ptr<sal_Unicode[]> puJoined;

    const sal_Int32 nLeftover = m_seqSource.getLength();
    if( nLeftover > 0 )
    {
        // For surrogates !
        // Old rest and new input are copied into one array. A surrogate
        // pair should only rarely be split between two convert() calls, so
        // the extra copy is acceptable.
        puJoined.reset( new sal_Unicode[ nSourceSize + nLeftover ] );
        memcpy( puJoined.get() ,
                m_seqSource.getConstArray() ,
                nLeftover * sizeof( sal_Unicode ) );
        memcpy( puJoined.get() + nLeftover ,
                puSource ,
                nSourceSize * sizeof( sal_Unicode ) );

        puSource = puJoined.get();
        nSourceSize += nLeftover;

        m_seqSource = Sequence< sal_Unicode >();
    }

    // Three bytes per code unit is an upper boundary for converting to
    // utf8, which is the most frequent target.
    sal_Int32 nSeqSize = nSourceSize * 3;
    Sequence<sal_Int8> seqText( nSeqSize );

    sal_uInt32 uiInfo;
    sal_Size nUnitsDone = 0;
    sal_Size nBytesDone = 0;
    bool bGrowAndRetry;
    do
    {
        // fetched anew on every pass because realloc() may move the data
        char *pOutput = reinterpret_cast<char *>( seqText.getArray() );
        sal_Size nStepUnits;

        const sal_Size nStepBytes = rtl_convertUnicodeToText(
                                    m_convUnicode2Text,
                                    m_contextUnicode2Text,
                                    puSource + nUnitsDone ,
                                    nSourceSize - nUnitsDone ,
                                    pOutput + nBytesDone ,
                                    nSeqSize - nBytesDone,
                                    RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
                                    RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
                                    &uiInfo,
                                    &nStepUnits );
        nBytesDone += nStepBytes;
        nUnitsDone += nStepUnits;

        bGrowAndRetry = ( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) != 0;
        if( bGrowAndRetry )
        {
            // double array size and continue where we stopped
            nSeqSize = nSeqSize * 2;
            seqText.realloc( nSeqSize );
        }
    }
    while( bGrowAndRetry );

    // for surrogates
    if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL )
    {
        const sal_Unicode *puTail = puSource + nUnitsDone;
        m_seqSource.realloc( nSourceSize - nUnitsDone );
        memcpy( m_seqSource.getArray() ,
                puTail ,
                (nSourceSize - nUnitsDone) * sizeof( sal_Unicode ) );
    }

    // reduce the size of the buffer (fast, no copy necessary)
    seqText.realloc( nBytesDone );

    return seqText;
}

}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Coverage Report

Created: 2025-11-16 09:57

Line	Count	Source
1		/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2		/*
3		* This file is part of the LibreOffice project.
4		*
5		* This Source Code Form is subject to the terms of the Mozilla Public
6		* License, v. 2.0. If a copy of the MPL was not distributed with this
7		* file, You can obtain one at http://mozilla.org/MPL/2.0/.
8		*
9		* This file incorporates work covered by the following license notice:
10		*
11		* Licensed to the Apache Software Foundation (ASF) under one or more
12		* contributor license agreements. See the NOTICE file distributed
13		* with this work for additional information regarding copyright
14		* ownership. The ASF licenses this file to you under the Apache
15		* License, Version 2.0 (the "License"); you may not use this file
16		* except in compliance with the License. You may obtain a copy of
17		* the License at http://www.apache.org/licenses/LICENSE-2.0 .
18		*/
19		#include <string.h>
20
21		#include <algorithm>
22
23		#include <sal/types.h>
24
25		#include <rtl/textenc.h>
26		#include <rtl/tencinfo.h>
27		#include <com/sun/star/io/NotConnectedException.hpp>
28		#include <com/sun/star/io/XInputStream.hpp>
29		#include <xml2utf.hxx>
30		#include <memory>
31
32
33		using namespace ::com::sun::star::uno;
34		using namespace ::com::sun::star::io;
35
36
37		namespace sax_expatwrap {
38
39		sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
40	534k	{
41	534k	if( ! m_in.is() ) {
42	0	throw NotConnectedException();
43	0	}
44	534k	if( ! m_bStarted ) {
45		// it should be possible to find the encoding attribute
46		// within the first 512 bytes == 128 chars in UCS-4
47	261k	nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead );
48	261k	}
49
50	534k	sal_Int32 nRead;
51	534k	Sequence< sal_Int8 > seqStart;
52	534k	while( true )
53	534k	{
54	534k	nRead = m_in->readSomeBytes( seq , nMaxToRead );
55
56	534k	if( nRead + seqStart.getLength())
57	286k	{
58		// if nRead is 0, the file is already eof.
59	286k	if( ! m_bStarted && nRead )
60	255k	{
61		// ensure that enough data is available to parse encoding
62	255k	if( seqStart.hasElements() )
63	101	{
64		// prefix with what we had so far.
65	101	sal_Int32 nLength = seq.getLength();
66	101	seq.realloc( seqStart.getLength() + nLength );
67
68	101	memmove (seq.getArray() + seqStart.getLength(),
69	101	seq.getConstArray(),
70	101	nLength);
71	101	memcpy (seq.getArray(),
72	101	seqStart.getConstArray(),
73	101	seqStart.getLength());
74	101	}
75
76		// autodetection with the first bytes
77	255k	if( ! isEncodingRecognizable( seq ) )
78	325	{
79		// remember what we have so far.
80	325	seqStart = seq;
81
82		// read more !
83	325	continue;
84	325	}
85	254k	if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) {
86		// initialize decoding
87	209k	initializeDecoding();
88	209k	}
89	254k	seqStart = Sequence < sal_Int8 > ();
90	254k	}
91
92		// do the encoding
93	286k	if( m_pText2Unicode && m_pUnicode2Text &&
94	41.4k	m_pText2Unicode->canContinue() ) {
95
96	39.4k	Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
97	39.4k	seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
98	39.4k	}
99
100	286k	if( ! m_bStarted )
101	255k	{
102		// it must now be ensured, that no encoding attribute exist anymore
103		// ( otherwise the expat-Parser will crash )
104		// This must be done after decoding !
105		// ( e.g. Files decoded in ucs-4 cannot be read properly )
106	255k	m_bStarted = true;
107	255k	removeEncoding( seq );
108	255k	}
109	286k	nRead = seq.getLength();
110	286k	}
111
112	534k	break;
113	534k	}
114	534k	return nRead;
115	534k	}
116
117		void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
118	255k	{
119	255k	const sal_Int8 *pSource = seq.getArray();
120	255k	if (seq.getLength() < 5 || strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5))
121	55.9k	return;
122
123		// scan for encoding
124	199k	OString str( reinterpret_cast<char const *>(pSource), seq.getLength() );
125
126		// cut sequence to first line break
127		// find first line break;
128	199k	int nMax = str.indexOf( 10 );
129	199k	if( nMax >= 0 )
130	192k	{
131	192k	str = str.copy( 0 , nMax );
132	192k	}
133
134	199k	int nFound = str.indexOf( " encoding" );
135	199k	if( nFound < 0 ) return;
136
137	194k	int nStop;
138	194k	int nStart = str.indexOf( "\"" , nFound );
139	194k	if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
140	168k	{
141	168k	nStart = str.indexOf( "'" , nFound );
142	168k	nStop = str.indexOf( "'" , nStart +1 );
143	168k	}
144	26.2k	else
145	26.2k	{
146	26.2k	nStop = str.indexOf( "\"" , nStart +1);
147	26.2k	}
148
149	194k	if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
150	28.6k	{
151		// remove encoding tag from file
152	28.6k	memmove( &( seq.getArray()[nFound] ) ,
153	28.6k	&( seq.getArray()[nStop+1]) ,
154	28.6k	seq.getLength() - nStop -1);
155	28.6k	seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
156	28.6k	}
157	194k	}
158
159		// Checks, if enough data has been accumulated to recognize the encoding
160		bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
161	255k	{
162	255k	const sal_Int8 *pSource = seq.getConstArray();
163	255k	bool bCheckIfFirstClosingBracketExists = false;
164
165	255k	if( seq.getLength() < 8 ) {
166		// no recognition possible, when less than 8 bytes are available
167	156	return false;
168	156	}
169
170	255k	if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 5 ) ) {
171		// scan if the <?xml tag finishes within this buffer
172	195k	bCheckIfFirstClosingBracketExists = true;
173	195k	}
174	59.8k	else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
175	41.2k	('?' == pSource[4] || '?' == pSource[6] ) )
176	535	{
177		// check for utf-16
178	535	bCheckIfFirstClosingBracketExists = true;
179	535	}
180	59.3k	else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
181	16.2k	( '?' == pSource[5] || '?' == pSource[7] ) )
182	114	{
183		// check for
184	114	bCheckIfFirstClosingBracketExists = true;
185	114	}
186
187	255k	if( bCheckIfFirstClosingBracketExists )
188	195k	{
189		// whole <?xml tag is valid
190	195k	return std::find(seq.begin(), seq.end(), '>') != seq.end();
191	195k	}
192
193		// No <? tag in front, no need for a bigger buffer
194	59.2k	return true;
195	255k	}
196
197		bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
198	254k	{
199	254k	const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
200	254k	bool bReturn = true;
201
202	254k	if( seq.getLength() < 4 ) {
203		// no recognition possible, when less than 4 bytes are available
204	0	return false;
205	0	}
206
207		// first level : detect possible file formats
208	254k	if (seq.getLength() >= 5 && !strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5)) {
209		// scan for encoding
210	195k	OString str( reinterpret_cast<const char *>(pSource), seq.getLength() );
211
212		// cut sequence to first line break
213		//find first line break;
214	195k	int nMax = str.indexOf( 10 );
215	195k	if( nMax >= 0 )
216	191k	{
217	191k	str = str.copy( 0 , nMax );
218	191k	}
219
220	195k	int nFound = str.indexOf( " encoding" );
221	195k	if( nFound >= 0 ) {
222	190k	int nStop;
223	190k	int nStart = str.indexOf( "\"" , nFound );
224	190k	if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
225	164k	{
226	164k	nStart = str.indexOf( "'" , nFound );
227	164k	nStop = str.indexOf( "'" , nStart +1 );
228	164k	}
229	26.1k	else
230	26.1k	{
231	26.1k	nStop = str.indexOf( "\"" , nStart +1);
232	26.1k	}
233	190k	if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
234	28.6k	{
235		// encoding found finally
236	28.6k	m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
237	28.6k	}
238	190k	}
239	195k	}
240	59.7k	else if( 0xFE == pSource[0] &&
241	544	0xFF == pSource[1] ) {
242		// UTF-16 big endian
243		// conversion is done so that encoding information can be easily extracted
244	535	m_sEncoding = "utf-16"_ostr;
245	535	}
246	59.2k	else if( 0xFF == pSource[0] &&
247	285	0xFE == pSource[1] ) {
248		// UTF-16 little endian
249		// conversion is done so that encoding information can be easily extracted
250	275	m_sEncoding = "utf-16"_ostr;
251	275	}
252	58.9k	else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
253		// UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
254		// The byte order mark is simply added
255
256		// simply add the byte order mark !
257	210	seq.realloc( seq.getLength() + 2 );
258	210	memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
259	210	reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFE;
260	210	reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFF;
261
262	210	m_sEncoding = "utf-16"_ostr;
263	210	}
264	58.7k	else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
265		// UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
266		// The byte order mark is simply added
267
268	363	seq.realloc( seq.getLength() + 2 );
269	363	memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
270	363	reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFF;
271	363	reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFE;
272
273	363	m_sEncoding = "utf-16"_ostr;
274	363	}
275	58.4k	else if( 0xEF == pSource[0] &&
276	12.5k	0xBB == pSource[1] &&
277	12.5k	0xBF == pSource[2] )
278	12.5k	{
279		// UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
280		// The BOM is removed.
281	12.5k	memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
282	12.5k	seq.realloc( seq.getLength() - 3 );
283	12.5k	m_sEncoding = "utf-8"_ostr;
284	12.5k	}
285	45.8k	else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
286		// UCS-4 big endian
287	6	m_sEncoding = "ucs-4"_ostr;
288	6	}
289	45.8k	else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
290		// UCS-4 little endian
291	9	m_sEncoding = "ucs-4"_ostr;
292	9	}
293		/* TODO: no need to test for the moment since we return sal_False like default case anyway
294		else if( 0x4c == pSource[0] && 0x6f == pSource[1] &&
295		0xa7 == static_cast<unsigned char> (pSource[2]) &&
296		0x94 == static_cast<unsigned char> (pSource[3]) ) {
297		// EBCDIC
298		bReturn = sal_False; // must be extended
299		}
300		*/
301	45.8k	else {
302		// other
303		// UTF8 is directly recognized by the parser.
304	45.8k	bReturn = false;
305	45.8k	}
306
307	254k	return bReturn;
308	254k	}
309
310		void XMLFile2UTFConverter::initializeDecoding()
311	209k	{
312
313	209k	if( !m_sEncoding.isEmpty() )
314	42.6k	{
315	42.6k	rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
316	42.6k	if( encoding != RTL_TEXTENCODING_UTF8 )
317	29.9k	{
318	29.9k	m_pText2Unicode = std::make_unique<Text2UnicodeConverter>( m_sEncoding );
319	29.9k	m_pUnicode2Text = std::make_unique<Unicode2TextConverter>( RTL_TEXTENCODING_UTF8 );
320	29.9k	}
321	42.6k	}
322	209k	}
323
324
325		// Text2UnicodeConverter
326
327
328		Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
329	29.9k	: m_convText2Unicode(nullptr)
330	29.9k	, m_contextText2Unicode(nullptr)
331	29.9k	{
332	29.9k	rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
333	29.9k	if( RTL_TEXTENCODING_DONTKNOW == encoding )
334	1.72k	{
335	1.72k	m_bCanContinue = false;
336	1.72k	m_bInitialized = false;
337	1.72k	}
338	28.2k	else
339	28.2k	{
340	28.2k	init( encoding );
341	28.2k	}
342	29.9k	}
343
344		Text2UnicodeConverter::~Text2UnicodeConverter()
345	29.9k	{
346	29.9k	if( m_bInitialized )
347	28.2k	{
348	28.2k	rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
349	28.2k	rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
350	28.2k	}
351	29.9k	}
352
353		void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
354	28.2k	{
355	28.2k	m_bCanContinue = true;
356	28.2k	m_bInitialized = true;
357
358	28.2k	m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding);
359	28.2k	m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
360	28.2k	}
361
362
363		Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
364	39.4k	{
365	39.4k	sal_uInt32 uiInfo;
366	39.4k	sal_Size nSrcCvtBytes = 0;
367	39.4k	sal_Size nTargetCount = 0;
368	39.4k	sal_Size nSourceCount = 0;
369
370		// the whole source size
371	39.4k	sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength();
372	39.4k	Sequence<sal_Unicode> seqUnicode ( nSourceSize );
373
374	39.4k	const sal_Int8 *pbSource = seqText.getConstArray();
375	39.4k	std::unique_ptr<sal_Int8[]> pbTempMem;
376
377	39.4k	if( m_seqSource.hasElements() ) {
378		// put old rest and new byte sequence into one array
379	25	pbTempMem.reset(new sal_Int8[ nSourceSize ]);
380	25	memcpy( pbTempMem.get() , m_seqSource.getConstArray() , m_seqSource.getLength() );
381	25	memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
382	25	pbSource = pbTempMem.get();
383
384		// set to zero again
385	25	m_seqSource = Sequence< sal_Int8 >();
386	25	}
387
388	39.4k	while( true ) {
389
390		/* All invalid characters are transformed to the unicode undefined char */
391	39.4k	nTargetCount += rtl_convertTextToUnicode(
392	39.4k	m_convText2Unicode,
393	39.4k	m_contextText2Unicode,
394	39.4k	reinterpret_cast<const char *>(&( pbSource[nSourceCount] )),
395	39.4k	nSourceSize - nSourceCount ,
396	39.4k	&( seqUnicode.getArray()[ nTargetCount ] ),
397	39.4k	seqUnicode.getLength() - nTargetCount,
398	39.4k	RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
399	39.4k	RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
400	39.4k	RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
401	39.4k	&uiInfo,
402	39.4k	&nSrcCvtBytes );
403	39.4k	nSourceCount += nSrcCvtBytes;
404
405	39.4k	if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL ) {
406		// save necessary bytes for next conversion
407	0	seqUnicode.realloc( seqUnicode.getLength() * 2 );
408	0	continue;
409	0	}
410	39.4k	break;
411	39.4k	}
412	39.4k	if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL ) {
413	840	m_seqSource.realloc( nSourceSize - nSourceCount );
414	840	memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
415	840	}
416
417		// set to correct unicode size
418	39.4k	seqUnicode.realloc( nTargetCount );
419
420	39.4k	return seqUnicode;
421	39.4k	}
422
423
424		// Unicode2TextConverter
425
426
427		Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
428	29.9k	{
429	29.9k	m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding );
430	29.9k	m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
431	29.9k	}
432
433
434		Unicode2TextConverter::~Unicode2TextConverter()
435	29.9k	{
436	29.9k	rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
437	29.9k	rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
438	29.9k	}
439
440
441		Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
442	39.4k	{
443	39.4k	std::unique_ptr<sal_Unicode[]> puTempMem;
444
445	39.4k	if( m_seqSource.hasElements() ) {
446		// For surrogates !
447		// put old rest and new byte sequence into one array
448		// In general when surrogates are used, they should be rarely
449		// cut off between two convert()-calls. So this code is used
450		// rarely and the extra copy is acceptable.
451	0	puTempMem.reset(new sal_Unicode[ nSourceSize + m_seqSource.getLength()]);
452	0	memcpy( puTempMem.get() ,
453	0	m_seqSource.getConstArray() ,
454	0	m_seqSource.getLength() * sizeof( sal_Unicode ) );
455	0	memcpy(
456	0	&(puTempMem[ m_seqSource.getLength() ]) ,
457	0	puSource ,
458	0	nSourceSize*sizeof( sal_Unicode ) );
459	0	puSource = puTempMem.get();
460	0	nSourceSize += m_seqSource.getLength();
461
462	0	m_seqSource = Sequence< sal_Unicode > ();
463	0	}
464
465
466	39.4k	sal_Size nTargetCount = 0;
467	39.4k	sal_Size nSourceCount = 0;
468
469	39.4k	sal_uInt32 uiInfo;
470	39.4k	sal_Size nSrcCvtChars;
471
472		// take nSourceSize * 3 as preference
473		// this is an upper boundary for converting to utf8,
474		// which most often used as the target.
475	39.4k	sal_Int32 nSeqSize = nSourceSize * 3;
476
477	39.4k	Sequence<sal_Int8> seqText( nSeqSize );
478	39.4k	char *pTarget = reinterpret_cast<char *>(seqText.getArray());
479	39.4k	while( true ) {
480
481	39.4k	nTargetCount += rtl_convertUnicodeToText(
482	39.4k	m_convUnicode2Text,
483	39.4k	m_contextUnicode2Text,
484	39.4k	&( puSource[nSourceCount] ),
485	39.4k	nSourceSize - nSourceCount ,
486	39.4k	&( pTarget[nTargetCount] ),
487	39.4k	nSeqSize - nTargetCount,
488	39.4k	RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
489	39.4k	RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
490	39.4k	&uiInfo,
491	39.4k	&nSrcCvtChars);
492	39.4k	nSourceCount += nSrcCvtChars;
493
494	39.4k	if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
495	0	nSeqSize = nSeqSize *2;
496	0	seqText.realloc( nSeqSize ); // double array size
497	0	pTarget = reinterpret_cast<char *>(seqText.getArray());
498	0	continue;
499	0	}
500	39.4k	break;
501	39.4k	}
502
503		// for surrogates
504	39.4k	if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
505	0	m_seqSource.realloc( nSourceSize - nSourceCount );
506	0	memcpy( m_seqSource.getArray() ,
507	0	&(puSource[nSourceCount]),
508	0	(nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
509	0	}
510
511		// reduce the size of the buffer (fast, no copy necessary)
512	39.4k	seqText.realloc( nTargetCount );
513
514	39.4k	return seqText;
515	39.4k	}
516
517		}
518
519		/* vim:set shiftwidth=4 softtabstop=4 expandtab: */