/src/libreoffice/svtools/source/svrtf/parrtf.cxx

Source
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */

#include <sal/config.h>
#include <sal/log.hxx>

#include <comphelper/scopeguard.hxx>

#include <o3tl/numeric.hxx>
#include <rtl/character.hxx>
#include <rtl/strbuf.hxx>
#include <rtl/tencinfo.h>
#include <rtl/ustrbuf.hxx>
#include <tools/stream.hxx>
#include <tools/debug.hxx>
#include <svtools/rtftoken.h>
#include <svtools/parrtf.hxx>

// ScanText() stops collecting text once its buffer has reached this length,
// so a long run of text is delivered as several text tokens.
const int MAX_STRING_LEN = 1024;

// ASCII-only character classification used by the tokenizer below
#define RTF_ISDIGIT( c ) rtl::isAsciiDigit(c)
#define RTF_ISALPHA( c ) rtl::isAsciiAlpha(c)

// Creates an RTF parser on top of the generic SvParser<int>, reading from rIn.
// nStackSize is handed through to the base class unchanged. The parser starts
// outside of any group, with the MS-1252 code set and a \uc count of 1.
SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
    : SvParser<int>( rIn, nStackSize )
    , nOpenBrackets(0)
    , nUPRLevel(0)
    , eCodeSet(RTL_TEXTENCODING_MS_1252)
    , nUCharOverread(1)
{
    // only set while ScanText() resolves a \u keyword itself
    bRTF_InTextRead = false;

    // the source encoding starts out as the default (ANSI) code set
    SetSrcEncoding( eCodeSet );
}

// Nothing to release by hand: all members clean up after themselves.
SvRTFParser::~SvRTFParser() = default;


// Reads the next raw token from the input, starting at the current
// look-ahead character nNextCh.
//
// Returns an RTF_* token id, or the character value itself for '{', '}' and
// end of input. A return value of 0 means that no token was produced because
// the parser left the Working state inside the scan loop.
//
// Side effects visible in this function: aToken, nTokenValue and
// bTokenHasValue are filled for control words; the bracket counter and the
// parser state stack are maintained for '{' / '}'; nUCharOverread, eState and
// the source encoding may change.
int SvRTFParser::GetNextToken_()
{
    int nRet = 0;
    do {
        // true: fetch a fresh look-ahead character at the end of this round;
        // false: nNextCh already holds the next unprocessed character
        bool bNextCh = true;
        switch( nNextCh )
        {
        case '\\':
            {
                // control characters
                nNextCh = GetNextChar();
                switch( nNextCh )
                {
                case '{':
                case '}':
                case '\\':
                case '+':       // I found it in a RTF-file
                case '~':       // nonbreaking space
                case '-':       // optional hyphen
                case '_':       // nonbreaking hyphen
                case '\'':      // HexValue
                    // These escapes are text: restore the backslash as
                    // look-ahead, rewind the stream by one and let ScanText()
                    // interpret the escape sequence itself.
                    nNextCh = '\\';
                    rInput.SeekRel( -1 );
                    ScanText();
                    nRet = RTF_TEXTTOKEN;
                    // only read on if ScanText() left no look-ahead behind
                    bNextCh = 0 == nNextCh;
                    break;

                case '*':       // ignoreflag
                    nRet = RTF_IGNOREFLAG;
                    break;
                case ':':       // subentry in an index entry
                    nRet = RTF_SUBENTRYINDEX;
                    break;
                case '|':       // formula-character
                    nRet = RTF_FORMULA;
                    break;

                // a backslash directly followed by a line end counts as \par
                case 0x0a:
                case 0x0d:
                    nRet = RTF_PAR;
                    break;

                default:
                    if( RTF_ISALPHA( nNextCh ) )
                    {
                        // control word: backslash followed by ASCII letters
                        aToken = "\\";
                        {
                            do {
                                aToken.appendUtf32(nNextCh);
                                nNextCh = GetNextChar();
                            } while( RTF_ISALPHA( nNextCh ) );
                        }

                        // minus before numeric parameters
                        bool bNegValue = false;
                        if( '-' == nNextCh )
                        {
                            bNegValue = true;
                            nNextCh = GetNextChar();
                        }

                        // possible numeric parameter
                        if( RTF_ISDIGIT( nNextCh ) )
                        {
                            OUStringBuffer aNumber;
                            do {
                                aNumber.append(static_cast<sal_Unicode>(nNextCh));
                                nNextCh = GetNextChar();
                            } while( RTF_ISDIGIT( nNextCh ) );
                            nTokenValue = OUString::unacquired(aNumber).toInt32();
                            if( bNegValue )
                                nTokenValue = -nTokenValue;
                            bTokenHasValue=true;
                        }
                        else if( bNegValue )        // restore minus
                        {
                            // the '-' did not start a number: make it the
                            // look-ahead again and rewind the stream by one
                            nNextCh = '-';
                            rInput.SeekRel( -1 );
                        }
                        if( ' ' == nNextCh )        // blank is part of token!
                            nNextCh = GetNextChar();

                        // search for the token in the table:
                        if( 0 == (nRet = GetRTFToken( aToken )) )
                            // Unknown Control
                            nRet = RTF_UNKNOWNCONTROL;

                        // bug 76812 - unicode token handled as normal text
                        // nNextCh already holds the character after the
                        // control word, so do not read another one below
                        bNextCh = false;
                        switch( nRet )
                        {
                        case RTF_UC:
                            // \ucN: number of characters to skip after each \u.
                            // NOTE(review): nTokenValue is read here whether or
                            // not a parameter was parsed in this call - confirm
                            // that the caller resets it per token.
                            if( 0 <= nTokenValue )
                            {
                                // the value is truncated to 8 bits here
                                nUCharOverread = static_cast<sal_uInt8>(nTokenValue);
                                if (!aParserStates.empty())
                                {
                                    //cmc: other ifdef breaks #i3584
                                    aParserStates.top().nUCharOverread = nUCharOverread;
                                }
                            }
                            aToken.setLength( 0 ); // #i47831# erase token to prevent the token from being treated as text
                            // read next token
                            nRet = 0;
                            break;

                        case RTF_UPR:
                            // Outside of SkipGroup() the \upr keyword is
                            // consumed here; inside SkipGroup() it is simply
                            // returned as RTF_UPR.
                            if (!_inSkipGroup)
                            {
                                // the loop below calls GetNextToken_()
                                // recursively, so nested \upr keywords are
                                // limited to keep the recursion depth bounded
                                if (nUPRLevel > 256) // fairly sure > 1 is probably an error, but provide some leeway
                                {
                                    SAL_WARN("svtools", "urp stack too deep");
                                    eState = SvParserState::Error;
                                    // leaves switch( nRet ); RTF_UPR is returned
                                    // with the parser in the Error state
                                    break;
                                }

                                ++nUPRLevel;

                                // UPR - overread the group with the ansi
                                //       information
                                int nNextToken;
                                do
                                {
                                    nNextToken = GetNextToken_();
                                }
                                while (nNextToken != '{' && nNextToken != sal_Unicode(EOF) && IsParserWorking());

                                SkipGroup();
                                GetNextToken_();  // overread the last bracket
                                // nothing to report, continue with the next token
                                nRet = 0;

                                --nUPRLevel;
                            }
                            break;

                        case RTF_U:
                            // When ScanText() has set bRTF_InTextRead it deals
                            // with the \u value itself and only RTF_U is
                            // returned. Otherwise the character is turned into
                            // a text token right here.
                            if( !bRTF_InTextRead )
                            {
                                nRet = RTF_TEXTTOKEN;
                                aToken = OUStringChar( static_cast<sal_Unicode>(nTokenValue) );

                                // overread the next n "RTF" characters. This
                                // can be also \{, \}, \'88
                                for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
                                {
                                    // line ends do not count as a character
                                    sal_uInt32 cAnsi = nNextCh;
                                    while( 0xD == cAnsi )
                                        cAnsi = GetNextChar();
                                    while( 0xA == cAnsi )
                                        cAnsi = GetNextChar();

                                    // after a backslash the following character
                                    // is consumed as well; for \' the two hex
                                    // digits are consumed on top of that
                                    if( '\\' == cAnsi &&
                                        '\'' == GetNextChar() )
                                        // skip HexValue
                                        GetHexValue();
                                    nNextCh = GetNextChar();
                                }
                                // text following the \u sequence is appended
                                // to the same token
                                ScanText();
                                bNextCh = 0 == nNextCh;
                            }
                            break;
                        }
                    }
                    else if( SvParserState::Pending != eState )
                    {
                        // Bug 34631 - "\ " read on - Blank as character
                        // eState = SvParserState::Error;
                        // the backslash is dropped; the character after it
                        // stays in nNextCh and is handled in the next round
                        bNextCh = false;
                    }
                    break;
                }
            }
            break;

        case sal_Unicode(EOF):
            // end of input: the parser is done and the EOF value is the token
            eState = SvParserState::Accepted;
            nRet = nNextCh;
            break;

        case '{':
            {
                // Opening a group saves the current \uc count and source
                // encoding. Nothing is pushed while the bracket counter is
                // negative (more '}' than '{' seen so far).
                if( 0 <= nOpenBrackets )
                {
                    RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
                    aParserStates.push( aState );
                }
                ++nOpenBrackets;
                DBG_ASSERT(
                    static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
                    "ParserStateStack unequal to bracket count" );
                nRet = nNextCh;
            }
            break;

        case '}':
            // Closing a group drops its saved state and reinstates the
            // settings of the enclosing group, or the defaults when the
            // outermost group was closed.
            --nOpenBrackets;
            if( 0 <= nOpenBrackets )
            {
                aParserStates.pop();
                if( !aParserStates.empty() )
                {
                    const RtfParserState_Impl& rRPS =
                            aParserStates.top();
                    nUCharOverread = rRPS.nUCharOverread;
                    SetSrcEncoding( rRPS.eCodeSet );
                }
                else
                {
                    nUCharOverread = 1;
                    SetSrcEncoding( GetCodeSet() );
                }
            }
            DBG_ASSERT(
                static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
                "ParserStateStack unequal to bracket count" );
            nRet = nNextCh;
            break;

        // bare line ends between tokens are ignored
        case 0x0d:
        case 0x0a:
            break;

        default:
            // now normal text follows
            ScanText();
            nRet = RTF_TEXTTOKEN;
            bNextCh = 0 == nNextCh;
            break;
        }

        if( bNextCh )
            nNextCh = GetNextChar();

    // keep scanning until a token was found or the parser stopped working
    } while( !nRet && SvParserState::Working == eState );
    return nRet;
}


// Consumes the two characters following a \' escape and returns what
// o3tl::convertToHex makes of them (high digit first). The second character
// read also becomes the current look-ahead nNextCh.
sal_Unicode SvRTFParser::GetHexValue()
{
    sal_uInt32 nHighDigit = GetNextChar();
    sal_uInt32 nLowDigit = GetNextChar();

    // callers continue reading from behind the second digit
    nNextCh = nLowDigit;

    return o3tl::convertToHex<sal_Unicode, 0>(nHighDigit, nLowDigit);
}

// Collects a run of document text, starting at the look-ahead character
// nNextCh, and appends it to aToken.
//
// Escapes that stand for text (\'hh, \\, \{, \}, \+, \~, \-, \_, \uN) are
// resolved here, and \ucN updates the skip count without ending the run.
// Scanning stops at a group delimiter, at any other control sequence, at a NUL
// character, at end of input, when the parser is no longer working, or once
// the collected text has reached MAX_STRING_LEN.
//
// On return nNextCh holds the first character that was not consumed; for a
// control sequence that is the backslash, with the stream rewound to match.
void SvRTFParser::ScanText()
{
    const sal_Unicode cBreak = 0;
    OUStringBuffer aStrBuffer;
    bool bContinue = true;
    while( bContinue && IsParserWorking() && aStrBuffer.getLength() < MAX_STRING_LEN)
    {
        // true: fetch a fresh look-ahead character at the end of this round
        bool bNextCh = true;
        switch( nNextCh )
        {
        case '\\':
            {
                nNextCh = GetNextChar();
                switch (nNextCh)
                {
                case '\'':
                    {
                        // A run of \'hh escapes, possibly mixed with plain
                        // characters, is gathered as raw bytes first and then
                        // converted as a whole with the current source encoding.

                        OStringBuffer aByteString;
                        while (true)
                        {
                            char c = static_cast<char>(GetHexValue());
                            /*
                             * Note: \'00 is a valid internal character in  a
                             * string in RTF. OStringBuffer supports
                             * appending nulls fine
                             */
                            aByteString.append(c);

                            bool bBreak = false;
                            bool bEOF = false;
                            // the character that ended the inner loop
                            char nSlash = '\\';
                            while (!bBreak)
                            {
                                auto next = GetNextChar();
                                if (sal_Unicode(EOF) == next)
                                {
                                    bEOF = true;
                                    break;
                                }
                                if (next>0xFF) // fix for #i43933# and #i35653#
                                {
                                    // a character that does not fit into a
                                    // byte: convert what was gathered so far
                                    // and append the character unconverted
                                    if (!aByteString.isEmpty())
                                    {
                                        aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
                                        aByteString.setLength(0);
                                    }
                                    aStrBuffer.append(static_cast<sal_Unicode>(next));

                                    continue;
                                }
                                nSlash = static_cast<char>(next);
                                // line ends inside the run are dropped
                                while (nSlash == 0xD || nSlash == 0xA)
                                    nSlash = static_cast<char>(GetNextChar());

                                switch (nSlash)
                                {
                                    // a group delimiter or a backslash ends
                                    // the plain part of the run
                                    case '{':
                                    case '}':
                                    case '\\':
                                        bBreak = true;
                                        break;
                                    default:
                                        aByteString.append(nSlash);
                                        break;
                                }
                            }

                            if (bEOF)
                            {
                                bContinue = false;        // abort, string together
                                break;
                            }

                            nNextCh = GetNextChar();

                            // Only another \' continues the byte run. Anything
                            // else: rewind by one and make the terminating
                            // character the look-ahead again.
                            if (nSlash != '\\' || nNextCh != '\'')
                            {
                                rInput.SeekRel(-1);
                                nNextCh = static_cast<unsigned char>(nSlash);
                                break;
                            }
                        }

                        // nNextCh is already the next unprocessed character
                        bNextCh = false;

                        if (!aByteString.isEmpty())
                        {
                            aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
                            aByteString.setLength(0);
                        }
                    }
                    break;
                // escaped characters that are taken over literally
                case '\\':
                case '}':
                case '{':
                case '+':       // I found in a RTF file
                    aStrBuffer.append(sal_Unicode(nNextCh));
                    break;
                case '~':       // nonbreaking space
                    aStrBuffer.append(u'\x00A0');
                    break;
                case '-':       // optional hyphen
                    aStrBuffer.append(u'\x00AD');
                    break;
                case '_':       // nonbreaking hyphen
                    aStrBuffer.append(u'\x2011');
                    break;

                case 'u':
                    // read UNI-Code characters
                    {
                        // Peek at the character after the 'u', then rewind the
                        // stream by two - presumably so that reading resumes at
                        // the 'u' itself; verify against GetNextChar().
                        nNextCh = GetNextChar();
                        rInput.SeekRel( -2 );

                        if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
                        {
                            // \uN: let GetNextToken_() parse the keyword and
                            // its value; the flag tells it to leave the
                            // handling of the value to this function
                            bRTF_InTextRead = true;

                            OUString sSave( aToken ); // GetNextToken_() overwrites this
                            nNextCh = '\\';
                            int nToken = GetNextToken_();
                            DBG_ASSERT( RTF_U == nToken, "still not a UNI-Code character" );
                            // don't convert symbol chars
                            aStrBuffer.append(static_cast< sal_Unicode >(nTokenValue));

                            // overread the next n "RTF" characters. This
                            // can be also \{, \}, \'88
                            for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
                            {
                                // line ends do not count as a character
                                sal_Unicode cAnsi = nNextCh;
                                while( 0xD == cAnsi )
                                    cAnsi = GetNextChar();
                                while( 0xA == cAnsi )
                                    cAnsi = GetNextChar();

                                // after a backslash the following character is
                                // consumed as well; for \' the two hex digits
                                // are consumed on top of that
                                if( '\\' == cAnsi &&
                                    '\'' == GetNextChar() )
                                    // skip HexValue
                                    GetHexValue();
                                nNextCh = GetNextChar();
                            }
                            bNextCh = false;
                            aToken = sSave;
                            bRTF_InTextRead = false;
                        }
                        else if ( 'c' == nNextCh )
                        {
                            // Prevent text breaking into multiple tokens.
                            // NOTE(review): only the second letter is checked,
                            // so every control word starting with "\uc" ends up
                            // here, not just \ucN - confirm this is intended.
                            // Undo the rewind from above, i.e. step over "uc".
                            rInput.SeekRel( 2 );
                            nNextCh = GetNextChar();
                            if (RTF_ISDIGIT( nNextCh ))
                            {
                                // 8-bit accumulator: larger values wrap around
                                sal_uInt8 nNewOverread = 0 ;
                                do {
                                    nNewOverread *= 10;
                                    nNewOverread += nNextCh - '0';
                                    nNextCh = GetNextChar();
                                } while ( RTF_ISDIGIT( nNextCh ) );
                                nUCharOverread = nNewOverread;
                                if (!aParserStates.empty())
                                    aParserStates.top().nUCharOverread = nNewOverread;
                            }
                            // a single blank after the keyword belongs to it
                            bNextCh = 0x20 == nNextCh;
                        }
                        else
                        {
                            // some other control word starting with 'u': hand
                            // it back to the tokenizer
                            nNextCh = '\\';
                            bContinue = false;        // abort, string together
                        }
                    }
                    break;

                default:
                    // any other control sequence ends the text: rewind by one
                    // and leave the backslash as look-ahead
                    rInput.SeekRel( -1 );
                    nNextCh = '\\';
                    bContinue = false;        // abort, string together
                    break;
                }
            }
            break;

        case sal_Unicode(EOF):
            // end of input reached as look-ahead: flag an error and stop
            eState = SvParserState::Error;
            [[fallthrough]];
        case '{':
        case '}':
            // group delimiters end the text; they stay in nNextCh
            bContinue = false;
            break;

        // line ends inside text are dropped
        case 0x0a:
        case 0x0d:
            break;

        default:
            if( nNextCh == cBreak || aStrBuffer.getLength() >= MAX_STRING_LEN)
                bContinue = false;
            else
            {
                // take the current character, plus any ASCII letters and
                // digits that follow directly, in one go
                do {
                    // all other characters end up in the text
                    aStrBuffer.appendUtf32(nNextCh);

                    if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
                    {
                        // input ended inside the run: deliver what was
                        // collected, eState is left untouched here
                        if (!aStrBuffer.isEmpty())
                            aToken.append( aStrBuffer );
                        return;
                    }
                } while
                (
                    (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
                    (aStrBuffer.getLength() < MAX_STRING_LEN)
                );
                // nNextCh already holds the next unprocessed character
                bNextCh = false;
            }
        }

        if( bContinue && bNextCh )
            nNextCh = GetNextChar();
    }

    // the collected text is appended to, not assigned to, the current token
    if (!aStrBuffer.isEmpty())
        aToken.append( aStrBuffer );
}


// Non-zero while SkipGroup() is running. It makes nested SkipGroup() calls
// return immediately and stops GetNextToken_() from consuming \upr itself.
// Being a static member, the counter is shared by all parser instances.
short SvRTFParser::_inSkipGroup=0;

// Skips the rest of the group the parser is currently in, including all
// groups nested inside it.
//
// Tokens are read and thrown away until the '}' matching the current group is
// the look-ahead character; that bracket is left in nNextCh for the caller.
// A \binN keyword makes the N bytes of binary data following it be skipped
// without being tokenized. If the input ends, or the parser stops working,
// before the matching bracket shows up, eState becomes Error unless the
// parser is Pending.
//
// The function does nothing when a SkipGroup() call is already in progress.
void SvRTFParser::SkipGroup()
{
    // Depth of open groups, counted from the group being skipped. The input
    // is not trusted, so this is a full int: a 16-bit counter wraps around
    // for groups nested deeper than 32767 levels and then reports the end of
    // the group at the wrong bracket.
    int nBrackets=1;
    if (_inSkipGroup>0)
        return;
    _inSkipGroup++;
//#i16185# faking \bin keyword
    do
    {
        // nNextCh is the character GetNextToken_() is about to consume, so
        // brackets are counted here, before they are turned into tokens
        switch (nNextCh)
        {
            case '{':
                ++nBrackets;
                break;
            case '}':
                if (!--nBrackets) {
                    // matching bracket found; it stays in nNextCh
                    _inSkipGroup--;
                    return;
                }
                break;
        }
        int nToken = GetNextToken_();
        if (nToken == RTF_BIN)
        {
            // jump over the binary data instead of scanning it: step back
            // onto the look-ahead character, seek past the data and read a
            // fresh look-ahead
            rInput.SeekRel(-1);
            SAL_WARN_IF(nTokenValue < 0, "svtools", "negative value argument for rtf \\bin keyword");
            if (nTokenValue > 0)
                rInput.SeekRel(nTokenValue);
            nNextCh = GetNextChar();
        }
        // line ends between tokens are of no interest
        while (nNextCh==0xa || nNextCh==0xd)
        {
            nNextCh = GetNextChar();
        }
    } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());

    // the loop ended without finding the matching bracket
    if( SvParserState::Pending != eState && '}' != nNextCh )
        eState = SvParserState::Error;
    _inSkipGroup--;
}

// Data of an unknown destination is not interpreted here; the group that
// contains it is skipped.
void SvRTFParser::ReadUnknownData()
{
    SkipGroup();
}
// Bitmap data is not interpreted here; the group that contains it is skipped.
void SvRTFParser::ReadBitmapData()
{
    SkipGroup();
}


// Starts parsing the stream from its current position.
//
// The look-ahead is primed with the first byte of the stream and the bracket
// counter and code set are put back to their defaults. The input must begin
// with '{' followed by the \rtf keyword; if it does not, the parser ends up in
// the Error state right away. Otherwise the token loop is run via Continue().
//
// Returns the parser state reached.
SvParserState SvRTFParser::CallParser()
{
    char cFirstByte(0);
    nNextChPos = rInput.Tell();
    rInput.ReadChar( cFirstByte );
    nNextCh = static_cast<unsigned char>(cFirstByte);
    eState = SvParserState::Working;
    nOpenBrackets = 0;
    eCodeSet = RTL_TEXTENCODING_MS_1252;
    SetSrcEncoding( eCodeSet );

    // the first two tokens should be '{' and \\rtf !!
    const bool bHasRtfHeader = '{' == GetNextToken() && RTF_RTF == GetNextToken();
    if( !bHasRtfHeader )
    {
        eState = SvParserState::Error;
        return eState;
    }

    AddFirstRef();
    {
        // Give the reference back when this scope is left, also when an
        // exception passes through - but not while the parser is merely
        // Pending, because then it is still needed.
        comphelper::ScopeGuard aReleaseGuard([this] {
            if( SvParserState::Pending != eState )
                ReleaseRef();
        });
        Continue( 0 );
    }

    return eState;
}

// Main token loop of the parser.
//
// nToken is the token to start with; when it is 0 the next token is fetched
// from the input first. The loop runs while the parser is working. The
// code-set keywords and unknown controls or groups are handled right here;
// every other token is passed on to NextToken(). After the loop, an accepted
// document that still has open groups is turned into an error.
void SvRTFParser::Continue( int nToken )
{
//  DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
//              "Characterset was changed." );

    if( !nToken )
        nToken = GetNextToken();

    // set when a round neither advanced the token index nor changed the token
    bool bLooping = false;

    while (IsParserWorking() && !bLooping)
    {
        // remembered for the no-progress check at the end of the round
        auto nCurrentTokenIndex = m_nTokenIndex;
        auto nCurrentToken = nToken;

        SaveState( nToken );
        switch( nToken )
        {
        case '}':
            // a bracket closing a nested group goes to NextToken(); the one
            // that brings the bracket count to zero ends the document
            if( nOpenBrackets )
                goto NEXTTOKEN;
            eState = SvParserState::Accepted;
            break;

        case '{':
            // an unknown group ?
            {
                // Look ahead for "{\*\<unknown keyword>". If the pattern does
                // not match, the tokens read ahead are handed back through
                // SkipToken() - presumably stepping back by one resp. two
                // tokens; verify against SvParser - and the resulting token
                // goes to NextToken().
                if( RTF_IGNOREFLAG != GetNextToken() )
                    nToken = SkipToken();
                else if( RTF_UNKNOWNCONTROL != GetNextToken() )
                    nToken = SkipToken( -2 );
                else
                {
                    // filter immediately
                    ReadUnknownData();
                    nToken = GetNextToken();
                    if( '}' != nToken )
                        eState = SvParserState::Error;
                    break;      // move to next token!!
                }
            }
            goto NEXTTOKEN;

        case RTF_UNKNOWNCONTROL:
            break;      // skip unknown token
        // the keywords below select the code set of the document; it is made
        // the current source encoding at once
        case RTF_NEXTTYPE:
        case RTF_ANSITYPE:
            eCodeSet = RTL_TEXTENCODING_MS_1252;
            SetSrcEncoding( eCodeSet );
            break;
        case RTF_MACTYPE:
            eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN;
            SetSrcEncoding( eCodeSet );
            break;
        case RTF_PCTYPE:
            eCodeSet = RTL_TEXTENCODING_IBM_437;
            SetSrcEncoding( eCodeSet );
            break;
        case RTF_PCATYPE:
            eCodeSet = RTL_TEXTENCODING_IBM_850;
            SetSrcEncoding( eCodeSet );
            break;
        case RTF_ANSICPG:
            // the keyword's parameter is taken as a Windows code page number
            eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
            SetSrcEncoding(eCodeSet);
            break;
        default:
            // also the jump target for the '{' and '}' cases above
NEXTTOKEN:
            NextToken( nToken );
            break;
        }
        if( IsParserWorking() )
            SaveState( 0 );         // processed till here,
                                    // continue with new token!
        nToken = GetNextToken();
        // same token at the same token index as at the start of the round:
        // no progress was made, so leave the loop instead of spinning forever
        bLooping = nCurrentTokenIndex == m_nTokenIndex && nToken == nCurrentToken;
    }
    // input ended while groups were still open
    if( SvParserState::Accepted == eState && 0 < nOpenBrackets )
        eState = SvParserState::Error;
}

// Makes eEnc the encoding of the current group and the active source
// encoding. RTL_TEXTENCODING_DONTKNOW stands for "use the code set of the
// document".
void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
{
    const rtl_TextEncoding eEffective
        = (RTL_TEXTENCODING_DONTKNOW == eEnc) ? GetCodeSet() : eEnc;

    // remember the choice in the state of the innermost open group, if any
    if (!aParserStates.empty())
        aParserStates.top().eCodeSet = eEffective;

    SetSrcEncoding(eEffective);
}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Coverage Report

Created: 2026-05-16 09:25

Line	Count	Source
1		/* -- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -- */
2		/*
3		* This file is part of the LibreOffice project.
4		*
5		* This Source Code Form is subject to the terms of the Mozilla Public
6		* License, v. 2.0. If a copy of the MPL was not distributed with this
7		* file, You can obtain one at http://mozilla.org/MPL/2.0/.
8		*
9		* This file incorporates work covered by the following license notice:
10		*
11		* Licensed to the Apache Software Foundation (ASF) under one or more
12		* contributor license agreements. See the NOTICE file distributed
13		* with this work for additional information regarding copyright
14		* ownership. The ASF licenses this file to you under the Apache
15		* License, Version 2.0 (the "License"); you may not use this file
16		* except in compliance with the License. You may obtain a copy of
17		* the License at http://www.apache.org/licenses/LICENSE-2.0 .
18		*/
19
20		#include <sal/config.h>
21		#include <sal/log.hxx>
22
23		#include <comphelper/scopeguard.hxx>
24
25		#include <o3tl/numeric.hxx>
26		#include <rtl/character.hxx>
27		#include <rtl/strbuf.hxx>
28		#include <rtl/tencinfo.h>
29		#include <rtl/ustrbuf.hxx>
30		#include <tools/stream.hxx>
31		#include <tools/debug.hxx>
32		#include <svtools/rtftoken.h>
33		#include <svtools/parrtf.hxx>
34
35		const int MAX_STRING_LEN = 1024;
36
37	6.74M	#define RTF_ISDIGIT( c ) rtl::isAsciiDigit(c)
38	18.4M	#define RTF_ISALPHA( c ) rtl::isAsciiAlpha(c)
39
40		SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
41	11.5k	: SvParser<int>( rIn, nStackSize )
42	11.5k	, nOpenBrackets(0)
43	11.5k	, nUPRLevel(0)
44	11.5k	, eCodeSet(RTL_TEXTENCODING_MS_1252)
45	11.5k	, nUCharOverread(1)
46	11.5k	{
47		// default is ANSI-CodeSet
48	11.5k	SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
49	11.5k	bRTF_InTextRead = false;
50	11.5k	}
51
52		SvRTFParser::~SvRTFParser()
53	11.5k	{
54	11.5k	}
55
56
57		int SvRTFParser::GetNextToken_()
58	2.95M	{
59	2.95M	int nRet = 0;
60	3.14M	do {
61	3.14M	bool bNextCh = true;
62	3.14M	switch( nNextCh )
63	3.14M	{
64	1.39M	case '\\':
65	1.39M	{
66		// control characters
67	1.39M	nNextCh = GetNextChar();
68	1.39M	switch( nNextCh )
69	1.39M	{
70	5.53k	case '{':
71	6.33k	case '}':
72	21.2k	case '\\':
73	21.4k	case '+': // I found it in a RTF-file
74	22.4k	case '~': // nonbreaking space
75	23.1k	case '-': // optional hyphen
76	23.3k	case '_': // nonbreaking hyphen
77	34.3k	case '\'': // HexValue
78	34.3k	nNextCh = '\\';
79	34.3k	rInput.SeekRel( -1 );
80	34.3k	ScanText();
81	34.3k	nRet = RTF_TEXTTOKEN;
82	34.3k	bNextCh = 0 == nNextCh;
83	34.3k	break;
84
85	15.1k	case '*': // ignoreflag
86	15.1k	nRet = RTF_IGNOREFLAG;
87	15.1k	break;
88	3.24k	case ':': // subentry in an index entry
89	3.24k	nRet = RTF_SUBENTRYINDEX;
90	3.24k	break;
91	830	case '\|': // formula-character
92	830	nRet = RTF_FORMULA;
93	830	break;
94
95	374k	case 0x0a:
96	386k	case 0x0d:
97	386k	nRet = RTF_PAR;
98	386k	break;
99
100	951k	default:
101	951k	if( RTF_ISALPHA( nNextCh ) )
102	848k	{
103	848k	aToken = "\\";
104	848k	{
105	3.07M	do {
106	3.07M	aToken.appendUtf32(nNextCh);
107	3.07M	nNextCh = GetNextChar();
108	3.07M	} while( RTF_ISALPHA( nNextCh ) );
109	848k	}
110
111		// minus before numeric parameters
112	848k	bool bNegValue = false;
113	848k	if( '-' == nNextCh )
114	26.6k	{
115	26.6k	bNegValue = true;
116	26.6k	nNextCh = GetNextChar();
117	26.6k	}
118
119		// possible numeric parameter
120	848k	if( RTF_ISDIGIT( nNextCh ) )
121	323k	{
122	323k	OUStringBuffer aNumber;
123	716k	do {
124	716k	aNumber.append(static_cast<sal_Unicode>(nNextCh));
125	716k	nNextCh = GetNextChar();
126	716k	} while( RTF_ISDIGIT( nNextCh ) );
127	323k	nTokenValue = OUString::unacquired(aNumber).toInt32();
128	323k	if( bNegValue )
129	13.6k	nTokenValue = -nTokenValue;
130	323k	bTokenHasValue=true;
131	323k	}
132	525k	else if( bNegValue ) // restore minus
133	12.9k	{
134	12.9k	nNextCh = '-';
135	12.9k	rInput.SeekRel( -1 );
136	12.9k	}
137	848k	if( ' ' == nNextCh ) // blank is part of token!
138	89.8k	nNextCh = GetNextChar();
139
140		// search for the token in the table:
141	848k	if( 0 == (nRet = GetRTFToken( aToken )) )
142		// Unknown Control
143	165k	nRet = RTF_UNKNOWNCONTROL;
144
145		// bug 76812 - unicode token handled as normal text
146	848k	bNextCh = false;
147	848k	switch( nRet )
148	848k	{
149	4.73k	case RTF_UC:
150	4.73k	if( 0 <= nTokenValue )
151	3.48k	{
152	3.48k	nUCharOverread = static_cast<sal_uInt8>(nTokenValue);
153	3.48k	if (!aParserStates.empty())
154	3.25k	{
155		//cmc: other ifdef breaks #i3584
156	3.25k	aParserStates.top().nUCharOverread = nUCharOverread;
157	3.25k	}
158	3.48k	}
159	4.73k	aToken.setLength( 0 ); // #i47831# erase token to prevent the token from being treated as text
160		// read next token
161	4.73k	nRet = 0;
162	4.73k	break;
163
164	5.25k	case RTF_UPR:
165	5.25k	if (!_inSkipGroup)
166	4.31k	{
167	4.31k	if (nUPRLevel > 256) // fairly sure > 1 is probably an error, but provide some leeway
168	71	{
169	71	SAL_WARN("svtools", "urp stack too deep");
170	71	eState = SvParserState::Error;
171	71	break;
172	71	}
173
174	4.24k	++nUPRLevel;
175
176		// UPR - overread the group with the ansi
177		// information
178	4.24k	int nNextToken;
179	4.24k	do
180	6.76k	{
181	6.76k	nNextToken = GetNextToken_();
182	6.76k	}
183	6.76k	while (nNextToken != '{' && nNextToken != sal_Unicode(EOF) && IsParserWorking());
184
185	4.24k	SkipGroup();
186	4.24k	GetNextToken_(); // overread the last bracket
187	4.24k	nRet = 0;
188
189	4.24k	--nUPRLevel;
190	4.24k	}
191	5.18k	break;
192
193	20.3k	case RTF_U:
194	20.3k	if( !bRTF_InTextRead )
195	8.64k	{
196	8.64k	nRet = RTF_TEXTTOKEN;
197	8.64k	aToken = OUStringChar( static_cast<sal_Unicode>(nTokenValue) );
198
199		// overread the next n "RTF" characters. This
200		// can be also \{, \}, \'88
201	19.6k	for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
202	10.9k	{
203	10.9k	sal_uInt32 cAnsi = nNextCh;
204	11.7k	while( 0xD == cAnsi )
205	784	cAnsi = GetNextChar();
206	13.4k	while( 0xA == cAnsi )
207	2.47k	cAnsi = GetNextChar();
208
209	10.9k	if( '\\' == cAnsi &&
210	2.60k	'\'' == GetNextChar() )
211		// skip HexValue
212	206	GetHexValue();
213	10.9k	nNextCh = GetNextChar();
214	10.9k	}
215	8.64k	ScanText();
216	8.64k	bNextCh = 0 == nNextCh;
217	8.64k	}
218	20.3k	break;
219	848k	}
220	848k	}
221	102k	else if( SvParserState::Pending != eState )
222	102k	{
223		// Bug 34631 - "\ " read on - Blank as character
224		// eState = SvParserState::Error;
225	102k	bNextCh = false;
226	102k	}
227	951k	break;
228	1.39M	}
229	1.39M	}
230	1.39M	break;
231
232	1.39M	case sal_Unicode(EOF):
233	13.9k	eState = SvParserState::Accepted;
234	13.9k	nRet = nNextCh;
235	13.9k	break;
236
237	199k	case '{':
238	199k	{
239	199k	if( 0 <= nOpenBrackets )
240	197k	{
241	197k	RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
242	197k	aParserStates.push( aState );
243	197k	}
244	199k	++nOpenBrackets;
245	199k	DBG_ASSERT(
246	199k	static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
247	199k	"ParserStateStack unequal to bracket count" );
248	199k	nRet = nNextCh;
249	199k	}
250	199k	break;
251
252	96.2k	case '}':
253	96.2k	--nOpenBrackets;
254	96.2k	if( 0 <= nOpenBrackets )
255	87.0k	{
256	87.0k	aParserStates.pop();
257	87.0k	if( !aParserStates.empty() )
258	86.3k	{
259	86.3k	const RtfParserState_Impl& rRPS =
260	86.3k	aParserStates.top();
261	86.3k	nUCharOverread = rRPS.nUCharOverread;
262	86.3k	SetSrcEncoding( rRPS.eCodeSet );
263	86.3k	}
264	697	else
265	697	{
266	697	nUCharOverread = 1;
267	697	SetSrcEncoding( GetCodeSet() );
268	697	}
269	87.0k	}
270	96.2k	DBG_ASSERT(
271	96.2k	static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
272	96.2k	"ParserStateStack unequal to bracket count" );
273	96.2k	nRet = nNextCh;
274	96.2k	break;
275
276	20.3k	case 0x0d:
277	82.1k	case 0x0a:
278	82.1k	break;
279
280	1.35M	default:
281		// now normal text follows
282	1.35M	ScanText();
283	1.35M	nRet = RTF_TEXTTOKEN;
284	1.35M	bNextCh = 0 == nNextCh;
285	1.35M	break;
286	3.14M	}
287
288	3.14M	if( bNextCh )
289	1.50M	nNextCh = GetNextChar();
290
291	3.14M	} while( !nRet && SvParserState::Working == eState );
292	2.95M	return nRet;
293	2.95M	}
294
295
296		sal_Unicode SvRTFParser::GetHexValue()
297	40.6k	{
298		// collect Hex values
299	40.6k	sal_uInt32 nHi = GetNextChar();
300	40.6k	sal_uInt32 nLo = GetNextChar();
301	40.6k	nNextCh = nLo;
302	40.6k	return o3tl::convertToHex<sal_Unicode, 0>(nHi, nLo);
303	40.6k	}
304
/**
 * Collects a run of text, starting at the look-ahead character nNextCh, and
 * appends it to aToken.
 *
 * Characters are gathered in a local buffer. Scanning stops at '{' or '}',
 * at a backslash sequence that is not handled inline below, at a 0
 * character, at EOF, when the parser is no longer working, or once the
 * buffer has reached MAX_STRING_LEN characters (the limit is only checked
 * between steps, so a single step may push the buffer past it).
 *
 * Handled inline: \'hh byte runs, the escaped literals \\ \{ \} \+, the
 * special characters \~ \- \_, and the keywords \uN and \ucN.
 */
305		void SvRTFParser::ScanText()
306	1.40M	{
		// a 0 character terminates the text run (see the outer default case)
307	1.40M	const sal_Unicode cBreak = 0;
308	1.40M	OUStringBuffer aStrBuffer;
309	1.40M	bool bContinue = true;
310	6.66M	while( bContinue && IsParserWorking() && aStrBuffer.getLength() < MAX_STRING_LEN)
311	5.26M	{
		// true: fetch a fresh look-ahead character at the end of this iteration
312	5.26M	bool bNextCh = true;
313	5.26M	switch( nNextCh )
314	5.26M	{
315	668k	case '\\':
316	668k	{
		// decide by the character that follows the backslash
317	668k	nNextCh = GetNextChar();
318	668k	switch (nNextCh)
319	668k	{
320	17.5k	case '\'':
321	17.5k	{
322
		// \'hh: gather the escaped byte and the raw characters that
		// follow it as a byte string, which is converted with the
		// current source encoding once the run ends
323	17.5k	OStringBuffer aByteString;
324	39.3k	while (true)
325	39.3k	{
326	39.3k	char c = static_cast<char>(GetHexValue());
327		/*
328		* Note: \'00 is a valid internal character in a
329		* string in RTF. OStringBuffer supports
330		* appending nulls fine
331		*/
332	39.3k	aByteString.append(c);
333
334	39.3k	bool bBreak = false;
335	39.3k	bool bEOF = false;
336	39.3k	char nSlash = '\\';
		// consume characters up to the next '{', '}' or '\\'
337	311k	while (!bBreak)
338	272k	{
339	272k	auto next = GetNextChar();
340	272k	if (sal_Unicode(EOF) == next)
341	577	{
342	577	bEOF = true;
343	577	break;
344	577	}
		// a value above 0xFF does not fit into the byte string:
		// flush the pending bytes first, then append it directly
345	271k	if (next>0xFF) // fix for #i43933# and #i35653#
346	4.53k	{
347	4.53k	if (!aByteString.isEmpty())
348	2.91k	{
349	2.91k	aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
350	2.91k	aByteString.setLength(0);
351	2.91k	}
352	4.53k	aStrBuffer.append(static_cast<sal_Unicode>(next));
353
354	4.53k	continue;
355	4.53k	}
356	267k	nSlash = static_cast<char>(next);
		// CR and LF are dropped
357	269k	while (nSlash == 0xD \|\| nSlash == 0xA)
358	2.57k	nSlash = static_cast<char>(GetNextChar());
359
360	267k	switch (nSlash)
361	267k	{
362	1.28k	case '{':
363	3.26k	case '}':
364	38.7k	case '\\':
365	38.7k	bBreak = true;
366	38.7k	break;
367	228k	default:
368	228k	aByteString.append(nSlash);
369	228k	break;
370	267k	}
371	267k	}
372
373	39.3k	if (bEOF)
374	577	{
375	577	bContinue = false; // abort, string together
376	577	break;
377	577	}
378
379	38.7k	nNextCh = GetNextChar();
380
		// Only a directly following \' keeps the byte run going.
		// Otherwise step the stream back by one and make the
		// terminating character the look-ahead again.
381	38.7k	if (nSlash != '\\' \|\| nNextCh != '\'')
382	16.9k	{
383	16.9k	rInput.SeekRel(-1);
384	16.9k	nNextCh = static_cast<unsigned char>(nSlash);
385	16.9k	break;
386	16.9k	}
387	38.7k	}
388
		// nNextCh was already set above; do not read another character
389	17.5k	bNextCh = false;
390
391	17.5k	if (!aByteString.isEmpty())
392	16.5k	{
393	16.5k	aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
394	16.5k	aByteString.setLength(0);
395	16.5k	}
396	17.5k	}
397	0	break;
		// escaped characters that are taken over literally
398	45.9k	case '\\':
399	62.0k	case '}':
400	86.1k	case '{':
401	86.9k	case '+': // I found in a RTF file
402	86.9k	aStrBuffer.append(sal_Unicode(nNextCh));
403	86.9k	break;
404	704	case '~': // nonbreaking space
405	704	aStrBuffer.append(u'\x00A0');
406	704	break;
407	2.09k	case '-': // optional hyphen
408	2.09k	aStrBuffer.append(u'\x00AD');
409	2.09k	break;
410	458	case '_': // nonbreaking hyphen
411	458	aStrBuffer.append(u'\x2011');
412	458	break;
413
414	26.6k	case 'u':
415		// read UNI-Code characters
416	26.6k	{
		// look at the character after the 'u', then move the stream
		// back by two positions (presumably so that the keyword can be
		// re-read from its first letter - TODO confirm)
417	26.6k	nNextCh = GetNextChar();
418	26.6k	rInput.SeekRel( -2 );
419
420	26.6k	if( '-' == nNextCh \|\| RTF_ISDIGIT( nNextCh ) )
421	11.7k	{
		// \uN: let GetNextToken_() parse the keyword. While
		// bRTF_InTextRead is set, its RTF_U branch does nothing, so
		// the character is appended here instead.
422	11.7k	bRTF_InTextRead = true;
423
424	11.7k	OUString sSave( aToken ); // GetNextToken_() overwrites this
425	11.7k	nNextCh = '\\';
426	11.7k	int nToken = GetNextToken_();
427	11.7k	DBG_ASSERT( RTF_U == nToken, "still not a UNI-Code character" );
428		// don't convert symbol chars
429	11.7k	aStrBuffer.append(static_cast< sal_Unicode >(nTokenValue));
430
431		// overread the next n "RTF" characters. This
432		// can be also \{, \}, \'88
433	22.4k	for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
434	10.7k	{
435	10.7k	sal_Unicode cAnsi = nNextCh;
436	10.9k	while( 0xD == cAnsi )
437	232	cAnsi = GetNextChar();
438	12.0k	while( 0xA == cAnsi )
439	1.28k	cAnsi = GetNextChar();
440
441	10.7k	if( '\\' == cAnsi &&
442	2.48k	'\'' == GetNextChar() )
443		// skip HexValue
444	1.12k	GetHexValue();
445	10.7k	nNextCh = GetNextChar();
446	10.7k	}
		// the look-ahead is already positioned behind the overread
		// characters; restore the token text saved above
447	11.7k	bNextCh = false;
448	11.7k	aToken = sSave;
449	11.7k	bRTF_InTextRead = false;
450	11.7k	}
451	14.8k	else if ( 'c' == nNextCh )
452	2.98k	{
453		// Prevent text breaking into multiple tokens.
		// undo the rewind from above and read what follows "uc"
454	2.98k	rInput.SeekRel( 2 );
455	2.98k	nNextCh = GetNextChar();
456	2.98k	if (RTF_ISDIGIT( nNextCh ))
457	1.90k	{
		// Parse the decimal argument.
		// NOTE(review): the accumulator is a sal_uInt8, so large
		// arguments do not fit - presumably they wrap; confirm.
458	1.90k	sal_uInt8 nNewOverread = 0 ;
459	2.97k	do {
460	2.97k	nNewOverread *= 10;
461	2.97k	nNewOverread += nNextCh - '0';
462	2.97k	nNextCh = GetNextChar();
463	2.97k	} while ( RTF_ISDIGIT( nNextCh ) );
		// applies from now on and is recorded in the innermost
		// parser state, if there is one
464	1.90k	nUCharOverread = nNewOverread;
465	1.90k	if (!aParserStates.empty())
466	1.77k	aParserStates.top().nUCharOverread = nNewOverread;
467	1.90k	}
		// a single blank directly after the keyword is consumed
468	2.98k	bNextCh = 0x20 == nNextCh;
469	2.98k	}
470	11.8k	else
471	11.8k	{
		// some other sequence starting with 'u': stop here with the
		// backslash as look-ahead (the stream was rewound above)
472	11.8k	nNextCh = '\\';
473	11.8k	bContinue = false; // abort, string together
474	11.8k	}
475	26.6k	}
476	26.6k	break;
477
478	534k	default:
		// anything else after the backslash is not text: step back by
		// one, restore the backslash as look-ahead and stop
479	534k	rInput.SeekRel( -1 );
480	534k	nNextCh = '\\';
481	534k	bContinue = false; // abort, string together
482	534k	break;
483	668k	}
484	668k	}
485	668k	break;
486
		// EOF inside text sets the error state, then stops like a bracket
487	668k	case sal_Unicode(EOF):
488	759	eState = SvParserState::Error;
489	759	[[fallthrough]];
490	68.3k	case '{':
491	134k	case '}':
492	134k	bContinue = false;
493	134k	break;
494
		// line ends are not part of the text
495	84.4k	case 0x0a:
496	94.1k	case 0x0d:
497	94.1k	break;
498
499	4.36M	default:
500	4.36M	if( nNextCh == cBreak \|\| aStrBuffer.getLength() >= MAX_STRING_LEN)
501	710k	bContinue = false;
502	3.65M	else
503	3.65M	{
		// append the current character, then keep appending for as
		// long as ASCII letters or digits follow and the buffer is
		// below MAX_STRING_LEN
504	7.21M	do {
505		// all other characters end up in the text
506	7.21M	aStrBuffer.appendUtf32(nNextCh);
507
508	7.21M	if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
509	6.44k	{
		// EOF: hand over what was collected and return at once;
		// eState is not changed on this path
510	6.44k	if (!aStrBuffer.isEmpty())
511	6.44k	aToken.append( aStrBuffer );
512	6.44k	return;
513	6.44k	}
514	7.21M	} while
515	3.65M	(
516	7.21M	(RTF_ISALPHA(nNextCh) \|\| RTF_ISDIGIT(nNextCh)) &&
517	3.56M	(aStrBuffer.getLength() < MAX_STRING_LEN)
518	3.65M	);
		// nNextCh already holds the first character not appended
519	3.65M	bNextCh = false;
520	3.65M	}
521	5.26M	}
522
523	5.25M	if( bContinue && bNextCh )
524	184k	nNextCh = GetNextChar();
525	5.25M	}
526
		// append the collected text to the current token
527	1.39M	if (!aStrBuffer.isEmpty())
528	825k	aToken.append( aStrBuffer );
529	1.39M	}
530
531
532		short SvRTFParser::_inSkipGroup=0;
533
534		void SvRTFParser::SkipGroup()
535	9.40k	{
536	9.40k	short nBrackets=1;
537	9.40k	if (_inSkipGroup>0)
538	0	return;
539	9.40k	_inSkipGroup++;
540		//#i16185# faking \bin keyword
541	9.40k	do
542	60.1k	{
543	60.1k	switch (nNextCh)
544	60.1k	{
545	6.30k	case '{':
546	6.30k	++nBrackets;
547	6.30k	break;
548	10.1k	case '}':
549	10.1k	if (!--nBrackets) {
550	5.38k	_inSkipGroup--;
551	5.38k	return;
552	5.38k	}
553	4.78k	break;
554	60.1k	}
555	54.7k	int nToken = GetNextToken_();
556	54.7k	if (nToken == RTF_BIN)
557	652	{
558	652	rInput.SeekRel(-1);
559	652	SAL_WARN_IF(nTokenValue < 0, "svtools", "negative value argument for rtf \\bin keyword");
560	652	if (nTokenValue > 0)
561	238	rInput.SeekRel(nTokenValue);
562	652	nNextCh = GetNextChar();
563	652	}
564	56.6k	while (nNextCh==0xa \|\| nNextCh==0xd)
565	1.89k	{
566	1.89k	nNextCh = GetNextChar();
567	1.89k	}
568	54.7k	} while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
569
570	4.02k	if( SvParserState::Pending != eState && '}' != nNextCh )
571	3.71k	eState = SvParserState::Error;
572	4.02k	_inSkipGroup--;
573	4.02k	}
574
575	2.59k	void SvRTFParser::ReadUnknownData() { SkipGroup(); }
576	28	void SvRTFParser::ReadBitmapData() { SkipGroup(); }
577
578
579		SvParserState SvRTFParser::CallParser()
580	11.5k	{
581	11.5k	char cFirstCh(0);
582	11.5k	nNextChPos = rInput.Tell();
583	11.5k	rInput.ReadChar( cFirstCh );
584	11.5k	nNextCh = static_cast<unsigned char>(cFirstCh);
585	11.5k	eState = SvParserState::Working;
586	11.5k	nOpenBrackets = 0;
587	11.5k	eCodeSet = RTL_TEXTENCODING_MS_1252;
588	11.5k	SetSrcEncoding( eCodeSet );
589
590		// the first two tokens should be '{' and \\rtf !!
591	11.5k	if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
592	11.3k	{
593	11.3k	AddFirstRef();
594		// call ReleaseRef at end of this scope, even in the face of exceptions
595	11.3k	comphelper::ScopeGuard g([this] {
596	11.3k	if( SvParserState::Pending != eState )
597	11.3k	ReleaseRef(); // now parser is not needed anymore
598	11.3k	});
599	11.3k	Continue( 0 );
600	11.3k	}
601	154	else
602	154	eState = SvParserState::Error;
603
604	11.5k	return eState;
605	11.5k	}
606
607		void SvRTFParser::Continue( int nToken )
608	11.3k	{
609		// DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
610		// "Characterset was changed." );
611
612	11.3k	if( !nToken )
613	11.3k	nToken = GetNextToken();
614
615	11.3k	bool bLooping = false;
616
617	2.37M	while (IsParserWorking() && !bLooping)
618	2.35M	{
619	2.35M	auto nCurrentTokenIndex = m_nTokenIndex;
620	2.35M	auto nCurrentToken = nToken;
621
622	2.35M	SaveState( nToken );
623	2.35M	switch( nToken )
624	2.35M	{
625	61.1k	case '}':
626	61.1k	if( nOpenBrackets )
627	60.6k	goto NEXTTOKEN;
628	486	eState = SvParserState::Accepted;
629	486	break;
630
631	122k	case '{':
632		// an unknown group ?
633	122k	{
634	122k	if( RTF_IGNOREFLAG != GetNextToken() )
635	118k	nToken = SkipToken();
636	4.15k	else if( RTF_UNKNOWNCONTROL != GetNextToken() )
637	3.07k	nToken = SkipToken( -2 );
638	1.07k	else
639	1.07k	{
640		// filter immediately
641	1.07k	ReadUnknownData();
642	1.07k	nToken = GetNextToken();
643	1.07k	if( '}' != nToken )
644	46	eState = SvParserState::Error;
645	1.07k	break; // move to next token!!
646	1.07k	}
647	122k	}
648	121k	goto NEXTTOKEN;
649
650	144k	case RTF_UNKNOWNCONTROL:
651	144k	break; // skip unknown token
652	0	case RTF_NEXTTYPE:
653	1.86k	case RTF_ANSITYPE:
654	1.86k	eCodeSet = RTL_TEXTENCODING_MS_1252;
655	1.86k	SetSrcEncoding( eCodeSet );
656	1.86k	break;
657	704	case RTF_MACTYPE:
658	704	eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN;
659	704	SetSrcEncoding( eCodeSet );
660	704	break;
661	1.00k	case RTF_PCTYPE:
662	1.00k	eCodeSet = RTL_TEXTENCODING_IBM_437;
663	1.00k	SetSrcEncoding( eCodeSet );
664	1.00k	break;
665	137	case RTF_PCATYPE:
666	137	eCodeSet = RTL_TEXTENCODING_IBM_850;
667	137	SetSrcEncoding( eCodeSet );
668	137	break;
669	9.77k	case RTF_ANSICPG:
670	9.77k	eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
671	9.77k	SetSrcEncoding(eCodeSet);
672	9.77k	break;
673	2.01M	default:
674	2.19M	NEXTTOKEN:
675	2.19M	NextToken( nToken );
676	2.19M	break;
677	2.35M	}
678	2.35M	if( IsParserWorking() )
679	2.35M	SaveState( 0 ); // processed till here,
680		// continue with new token!
681	2.35M	nToken = GetNextToken();
682	2.35M	bLooping = nCurrentTokenIndex == m_nTokenIndex && nToken == nCurrentToken;
683	2.35M	}
684	11.3k	if( SvParserState::Accepted == eState && 0 < nOpenBrackets )
685	9.82k	eState = SvParserState::Error;
686	11.3k	}
687
688		void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
689	60.2k	{
690	60.2k	if (eEnc == RTL_TEXTENCODING_DONTKNOW)
691	39.7k	eEnc = GetCodeSet();
692
693	60.2k	if (!aParserStates.empty())
694	59.6k	aParserStates.top().eCodeSet = eEnc;
695	60.2k	SetSrcEncoding(eEnc);
696	60.2k	}
697
698		/* vim:set shiftwidth=4 softtabstop=4 expandtab: */