/src/libreoffice/svtools/source/svrtf/parrtf.cxx

Source (jump to first uncovered line)
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */

#include <sal/config.h>
#include <sal/log.hxx>

#include <comphelper/scopeguard.hxx>

#include <rtl/character.hxx>
#include <rtl/strbuf.hxx>
#include <rtl/tencinfo.h>
#include <rtl/ustrbuf.hxx>
#include <tools/stream.hxx>
#include <tools/debug.hxx>
#include <svtools/rtftoken.h>
#include <svtools/parrtf.hxx>

// Upper bound for the length of the text buffer that ScanText() fills for a
// single text token; longer runs of text are split into several tokens.
const int MAX_STRING_LEN = 1024;

// Character classification used by the tokenizer. Both forward to the ASCII-only
// rtl:: helpers, so only ASCII digits / letters are accepted as parts of a
// control word or its numeric parameter.
#define RTF_ISDIGIT( c ) rtl::isAsciiDigit(c)
#define RTF_ISALPHA( c ) rtl::isAsciiAlpha(c)

/**
 * Sets up an RTF parser on top of the generic SvParser<int>.
 *
 * @param rIn         stream the RTF data is read from (handed to the base class)
 * @param nStackSize  token stack size (handed to the base class)
 *
 * The parser starts outside of any group (no open brackets, no \upr nesting),
 * with one fallback character to skip after a \u keyword and with
 * Windows-1252 as the document code set.
 */
SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
    : SvParser<int>( rIn, nStackSize )
    , nOpenBrackets(0)
    , nUPRLevel(0)
    , eCodeSet(RTL_TEXTENCODING_MS_1252)
    , nUCharOverread(1)
{
    // The source encoding starts out as the default (ANSI) code set that was
    // just stored in eCodeSet.
    SetSrcEncoding( eCodeSet );

    // Not inside ScanText()'s \u handling yet.
    bRTF_InTextRead = false;
}

// Nothing to release explicitly; members clean up after themselves.
SvRTFParser::~SvRTFParser() = default;


/**
 * Reads the next RTF token from the input, starting at the current look-ahead
 * character nNextCh.
 *
 * The scan loop repeats until a non-zero token has been produced or eState is
 * no longer SvParserState::Working. The return value is
 *  - the character itself for '{', '}' and end of input,
 *  - RTF_TEXTTOKEN for text (the text is collected into aToken by ScanText()),
 *  - RTF_IGNOREFLAG, RTF_SUBENTRYINDEX, RTF_FORMULA or RTF_PAR for the control
 *    symbols "\*", "\:", "\|" and backslash followed by CR or LF,
 *  - for a control word the value returned by GetRTFToken(), or
 *    RTF_UNKNOWNCONTROL when that lookup returns 0,
 *  - 0 only when the loop was left because of the parser state.
 *
 * RTF_UC is consumed here and never returned; RTF_UPR is consumed here as well
 * unless a group skip is active or the nesting limit is exceeded.
 *
 * Besides the return value this updates nNextCh, aToken, nTokenValue and
 * bTokenHasValue (only when a control word carries a numeric parameter), the
 * bracket counter nOpenBrackets together with the aParserStates stack,
 * nUCharOverread, the source encoding and eState.
 */
int SvRTFParser::GetNextToken_()
{
    int nRet = 0;
    do {
        // true: fetch a fresh look-ahead character at the end of this round;
        // false: nNextCh already holds the next unprocessed character
        bool bNextCh = true;
        switch( nNextCh )
        {
        case '\\':
            {
                // control characters
                nNextCh = GetNextChar();
                switch( nNextCh )
                {
                case '{':
                case '}':
                case '\\':
                case '+':       // I found it in a RTF-file
                case '~':       // nonbreaking space
                case '-':       // optional hyphen
                case '_':       // nonbreaking hyphen
                case '\'':      // HexValue
                    // These escapes stand for text: put the backslash back into
                    // the look-ahead, step the stream back by one and let
                    // ScanText() translate the escape.
                    nNextCh = '\\';
                    rInput.SeekRel( -1 );
                    ScanText();
                    nRet = RTF_TEXTTOKEN;
                    // read on only if ScanText() left nNextCh at 0
                    bNextCh = 0 == nNextCh;
                    break;

                case '*':       // ignoreflag
                    nRet = RTF_IGNOREFLAG;
                    break;
                case ':':       // subentry in an index entry
                    nRet = RTF_SUBENTRYINDEX;
                    break;
                case '|':       // formula-character
                    nRet = RTF_FORMULA;
                    break;

                // backslash directly followed by a line end is reported as a paragraph
                case 0x0a:
                case 0x0d:
                    nRet = RTF_PAR;
                    break;

                default:
                    if( RTF_ISALPHA( nNextCh ) )
                    {
                        // control word: collect backslash plus all ASCII letters
                        aToken = "\\";
                        {
                            do {
                                aToken.appendUtf32(nNextCh);
                                nNextCh = GetNextChar();
                            } while( RTF_ISALPHA( nNextCh ) );
                        }

                        // minus before numeric parameters
                        bool bNegValue = false;
                        if( '-' == nNextCh )
                        {
                            bNegValue = true;
                            nNextCh = GetNextChar();
                        }

                        // possible numeric parameter
                        if( RTF_ISDIGIT( nNextCh ) )
                        {
                            OUStringBuffer aNumber;
                            do {
                                aNumber.append(static_cast<sal_Unicode>(nNextCh));
                                nNextCh = GetNextChar();
                            } while( RTF_ISDIGIT( nNextCh ) );
                            nTokenValue = OUString::unacquired(aNumber).toInt32();
                            if( bNegValue )
                                nTokenValue = -nTokenValue;
                            bTokenHasValue=true;
                        }
                        else if( bNegValue )        // restore minus
                        {
                            // the '-' was not the sign of a parameter: make it
                            // the look-ahead again and un-read the character
                            // fetched after it
                            nNextCh = '-';
                            rInput.SeekRel( -1 );
                        }
                        if( ' ' == nNextCh )        // blank is part of token!
                            nNextCh = GetNextChar();

                        // search for the token in the table:
                        if( 0 == (nRet = GetRTFToken( aToken )) )
                            // Unknown Control
                            nRet = RTF_UNKNOWNCONTROL;

                        // bug 76812 - unicode token handled as normal text
                        // (nNextCh already is the character behind the control word)
                        bNextCh = false;
                        switch( nRet )
                        {
                        case RTF_UC:
                            // number of fallback characters to skip after \u;
                            // negative values are ignored, larger ones are cut
                            // down to sal_uInt8 by the cast
                            if( 0 <= nTokenValue )
                            {
                                nUCharOverread = static_cast<sal_uInt8>(nTokenValue);
                                if (!aParserStates.empty())
                                {
                                    //cmc: other ifdef breaks #i3584
                                    aParserStates.top().nUCharOverread = nUCharOverread;
                                }
                            }
                            aToken.setLength( 0 ); // #i47831# erase token to prevent the token from being treated as text
                            // read next token
                            nRet = 0;
                            break;

                        case RTF_UPR:
                            // handled here only when no SkipGroup() is running;
                            // otherwise RTF_UPR is returned like any other token
                            if (!_inSkipGroup)
                            {
                                if (nUPRLevel > 256) // fairly sure > 1 is probably an error, but provide some leeway
                                {
                                    // leaves nRet == RTF_UPR and ends the scan
                                    // loop through the error state
                                    SAL_WARN("svtools", "urp stack too deep");
                                    eState = SvParserState::Error;
                                    break;
                                }

                                ++nUPRLevel;

                                // UPR - overread the group with the ansi
                                //       information
                                // (read tokens recursively up to the next '{',
                                // end of input or a parser state change)
                                int nNextToken;
                                do
                                {
                                    nNextToken = GetNextToken_();
                                }
                                while (nNextToken != '{' && nNextToken != sal_Unicode(EOF) && IsParserWorking());

                                SkipGroup();
                                GetNextToken_();  // overread the last bracket
                                // nothing to report for \upr itself: continue scanning
                                nRet = 0;

                                --nUPRLevel;
                            }
                            break;

                        case RTF_U:
                            // When called from ScanText() (bRTF_InTextRead set)
                            // the caller appends the character and skips the
                            // fallback itself; RTF_U is returned unchanged then.
                            if( !bRTF_InTextRead )
                            {
                                nRet = RTF_TEXTTOKEN;
                                aToken = OUStringChar( static_cast<sal_Unicode>(nTokenValue) );

                                // overread the next n "RTF" characters. This
                                // can be also \{, \}, \'88
                                for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
                                {
                                    // line ends do not count as fallback characters
                                    sal_uInt32 cAnsi = nNextCh;
                                    while( 0xD == cAnsi )
                                        cAnsi = GetNextChar();
                                    while( 0xA == cAnsi )
                                        cAnsi = GetNextChar();

                                    // after a backslash one more character is
                                    // always consumed; for \' the two hex digits
                                    // are consumed in addition
                                    if( '\\' == cAnsi &&
                                        '\'' == GetNextChar() )
                                        // skip HexValue
                                        GetHexValue();
                                    nNextCh = GetNextChar();
                                }
                                // append any directly following text to the same token
                                ScanText();
                                bNextCh = 0 == nNextCh;
                            }
                            break;
                        }
                    }
                    else if( SvParserState::Pending != eState )
                    {
                        // Bug 34631 - "\ " read on - Blank as character
                        // eState = SvParserState::Error;
                        // (the backslash is dropped; the character behind it is
                        // processed in the next round of the loop)
                        bNextCh = false;
                    }
                    break;
                }
            }
            break;

        case sal_Unicode(EOF):
            // end of input ends parsing regularly; the EOF value is the token
            eState = SvParserState::Accepted;
            nRet = nNextCh;
            break;

        case '{':
            {
                // nOpenBrackets can be negative after surplus '}' (the '}'
                // branch decrements unconditionally); no state is pushed then
                if( 0 <= nOpenBrackets )
                {
                    // remember the settings that are valid when the group starts
                    RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
                    aParserStates.push( aState );
                }
                ++nOpenBrackets;
                DBG_ASSERT(
                    static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
                    "ParserStateStack unequal to bracket count" );
                nRet = nNextCh;
            }
            break;

        case '}':
            --nOpenBrackets;
            if( 0 <= nOpenBrackets )
            {
                // leave the group: drop its state and continue with the
                // settings of the enclosing group, or with the defaults
                // (one fallback character, document code set) at top level
                aParserStates.pop();
                if( !aParserStates.empty() )
                {
                    const RtfParserState_Impl& rRPS =
                            aParserStates.top();
                    nUCharOverread = rRPS.nUCharOverread;
                    SetSrcEncoding( rRPS.eCodeSet );
                }
                else
                {
                    nUCharOverread = 1;
                    SetSrcEncoding( GetCodeSet() );
                }
            }
            DBG_ASSERT(
                static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
                "ParserStateStack unequal to bracket count" );
            nRet = nNextCh;
            break;

        // bare line ends between tokens are skipped
        case 0x0d:
        case 0x0a:
            break;

        default:
            // now normal text follows
            ScanText();
            nRet = RTF_TEXTTOKEN;
            bNextCh = 0 == nNextCh;
            break;
        }

        if( bNextCh )
            nNextCh = GetNextChar();

    } while( !nRet && SvParserState::Working == eState );
    return nRet;
}


/**
 * Reads the two characters that follow a \' escape and combines them into one
 * value (first character = high nibble, second character = low nibble).
 *
 * Both characters are always consumed via GetNextChar(); nNextCh holds the
 * second one afterwards. A character that is not a hexadecimal digit
 * contributes 0 for its nibble.
 *
 * @return the combined value, 0..255
 */
sal_Unicode SvRTFParser::GetHexValue()
{
    sal_Unicode nResult = 0;

    for( int nDigit = 0; nDigit < 2; ++nDigit )
    {
        nNextCh = GetNextChar();

        // value of this hex digit; stays 0 for anything that is not one
        sal_Unicode nNibble = 0;
        if( nNextCh >= '0' && nNextCh <= '9' )
            nNibble = static_cast<sal_Unicode>(nNextCh - '0');
        else if( nNextCh >= 'a' && nNextCh <= 'f' )
            nNibble = static_cast<sal_Unicode>(nNextCh - 'a' + 10);
        else if( nNextCh >= 'A' && nNextCh <= 'F' )
            nNibble = static_cast<sal_Unicode>(nNextCh - 'A' + 10);

        // shift what was collected so far up by one hex digit and add the new one
        nResult = static_cast<sal_Unicode>(nResult * 16 + nNibble);
    }

    return nResult;
}

/**
 * Collects a run of text, starting at the look-ahead character nNextCh, and
 * appends it to aToken.
 *
 * Plain characters are copied; the escapes \\ \{ \} \+ \~ \- \_ are replaced
 * by the characters they stand for, \'hh sequences are decoded with the
 * current source encoding, and \uN / \ucN are processed inline so that they
 * do not split the text into several tokens. Line ends (CR, LF) are dropped.
 *
 * Scanning stops at
 *  - '{' or '}',
 *  - end of input (eState becomes Error when EOF is the look-ahead at the top
 *    of the loop; the EOF checks inside the plain-text and \' branches stop
 *    without changing eState),
 *  - a backslash sequence that is not one of the text escapes (nNextCh is set
 *    back to '\\' so the caller sees the control word),
 *  - a NUL character,
 *  - MAX_STRING_LEN collected characters, or when the parser is no longer
 *    working.
 *
 * The stream rewinds below assume that one character corresponds to one
 * stream position - NOTE(review): confirm for the encodings in use.
 */
void SvRTFParser::ScanText()
{
    const sal_Unicode cBreak = 0;
    // text is gathered here and appended to aToken in one go at the end
    OUStringBuffer aStrBuffer;
    bool bContinue = true;
    while( bContinue && IsParserWorking() && aStrBuffer.getLength() < MAX_STRING_LEN)
    {
        // true: fetch a new look-ahead at the end of this round
        bool bNextCh = true;
        switch( nNextCh )
        {
        case '\\':
            {
                nNextCh = GetNextChar();
                switch (nNextCh)
                {
                case '\'':
                    {

                        // Bytes given as \'hh (and the plain characters between
                        // them) are gathered as a byte string and converted as
                        // a whole with the source encoding.
                        OStringBuffer aByteString;
                        while (true)
                        {
                            char c = static_cast<char>(GetHexValue());
                            /*
                             * Note: \'00 is a valid internal character in  a
                             * string in RTF. OStringBuffer supports
                             * appending nulls fine
                             */
                            aByteString.append(c);

                            bool bBreak = false;
                            bool bEOF = false;
                            // the character that ended the inner loop below
                            char nSlash = '\\';
                            while (!bBreak)
                            {
                                auto next = GetNextChar();
                                if (sal_Unicode(EOF) == next)
                                {
                                    bEOF = true;
                                    break;
                                }
                                if (next>0xFF) // fix for #i43933# and #i35653#
                                {
                                    // not a byte: convert the bytes gathered so
                                    // far and take the character over as it is
                                    if (!aByteString.isEmpty())
                                    {
                                        aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
                                        aByteString.setLength(0);
                                    }
                                    aStrBuffer.append(static_cast<sal_Unicode>(next));

                                    continue;
                                }
                                nSlash = static_cast<char>(next);
                                // line ends are skipped
                                while (nSlash == 0xD || nSlash == 0xA)
                                    nSlash = static_cast<char>(GetNextChar());

                                switch (nSlash)
                                {
                                    // group delimiters and backslash end the byte run
                                    case '{':
                                    case '}':
                                    case '\\':
                                        bBreak = true;
                                        break;
                                    default:
                                        aByteString.append(nSlash);
                                        break;
                                }
                            }

                            if (bEOF)
                            {
                                bContinue = false;        // abort, string together
                                break;
                            }

                            nNextCh = GetNextChar();

                            // Only a directly following \' continues the byte
                            // run. Otherwise un-read the character just fetched
                            // and make the terminating character the look-ahead.
                            if (nSlash != '\\' || nNextCh != '\'')
                            {
                                rInput.SeekRel(-1);
                                nNextCh = static_cast<unsigned char>(nSlash);
                                break;
                            }
                        }

                        // nNextCh is already set up for the next round
                        bNextCh = false;

                        if (!aByteString.isEmpty())
                        {
                            aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
                            aByteString.setLength(0);
                        }
                    }
                    break;
                // escaped characters that stand for themselves
                case '\\':
                case '}':
                case '{':
                case '+':       // I found in a RTF file
                    aStrBuffer.append(sal_Unicode(nNextCh));
                    break;
                case '~':       // nonbreaking space
                    aStrBuffer.append(u'\x00A0');
                    break;
                case '-':       // optional hyphen
                    aStrBuffer.append(u'\x00AD');
                    break;
                case '_':       // nonbreaking hyphen
                    aStrBuffer.append(u'\x2011');
                    break;

                case 'u':
                    // read UNI-Code characters
                    {
                        // Peek at the character behind the 'u', then rewind two
                        // positions so the stream is just behind the backslash
                        // again.
                        nNextCh = GetNextChar();
                        rInput.SeekRel( -2 );

                        if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
                        {
                            // \uN: let GetNextToken_() parse keyword and number;
                            // the flag tells it to leave the character handling
                            // to this function
                            bRTF_InTextRead = true;

                            OUString sSave( aToken ); // GetNextToken_() overwrites this
                            nNextCh = '\\';
                            int nToken = GetNextToken_();
                            DBG_ASSERT( RTF_U == nToken, "still not a UNI-Code character" );
                            // don't convert symbol chars
                            aStrBuffer.append(static_cast< sal_Unicode >(nTokenValue));

                            // overread the next n "RTF" characters. This
                            // can be also \{, \}, \'88
                            for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
                            {
                                // line ends do not count as fallback characters
                                sal_Unicode cAnsi = nNextCh;
                                while( 0xD == cAnsi )
                                    cAnsi = GetNextChar();
                                while( 0xA == cAnsi )
                                    cAnsi = GetNextChar();

                                if( '\\' == cAnsi &&
                                    '\'' == GetNextChar() )
                                    // skip HexValue
                                    GetHexValue();
                                nNextCh = GetNextChar();
                            }
                            bNextCh = false;
                            aToken = sSave;
                            bRTF_InTextRead = false;
                        }
                        else if ( 'c' == nNextCh )
                        {
                            // Prevent text breaking into multiple tokens.
                            // Treated as the \uc keyword (only the 'c' is
                            // checked): move behind "uc" again and read the
                            // number directly.
                            rInput.SeekRel( 2 );
                            nNextCh = GetNextChar();
                            if (RTF_ISDIGIT( nNextCh ))
                            {
                                sal_uInt8 nNewOverread = 0 ;
                                do {
                                    nNewOverread *= 10;
                                    nNewOverread += nNextCh - '0';
                                    nNextCh = GetNextChar();
                                } while ( RTF_ISDIGIT( nNextCh ) );
                                nUCharOverread = nNewOverread;
                                if (!aParserStates.empty())
                                    aParserStates.top().nUCharOverread = nNewOverread;
                            }
                            // a blank directly behind the keyword is consumed
                            bNextCh = 0x20 == nNextCh;
                        }
                        else
                        {
                            // some other control word starting with 'u': the
                            // stream was already rewound above, hand the
                            // backslash back to the caller
                            nNextCh = '\\';
                            bContinue = false;        // abort, string together
                        }
                    }
                    break;

                default:
                    // not a text escape: un-read the character behind the
                    // backslash and let the caller handle the control word
                    rInput.SeekRel( -1 );
                    nNextCh = '\\';
                    bContinue = false;        // abort, string together
                    break;
                }
            }
            break;

        case sal_Unicode(EOF):
            eState = SvParserState::Error;
            [[fallthrough]];
        case '{':
        case '}':
            bContinue = false;
            break;

        // line ends inside text are dropped
        case 0x0a:
        case 0x0d:
            break;

        default:
            if( nNextCh == cBreak || aStrBuffer.getLength() >= MAX_STRING_LEN)
                bContinue = false;
            else
            {
                // Take over the current character and, in the same go, all
                // directly following ASCII letters and digits.
                do {
                    // all other characters end up in the text
                    aStrBuffer.appendUtf32(nNextCh);

                    if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
                    {
                        // end of input: hand over what was collected and leave
                        if (!aStrBuffer.isEmpty())
                            aToken.append( aStrBuffer );
                        return;
                    }
                } while
                (
                    (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
                    (aStrBuffer.getLength() < MAX_STRING_LEN)
                );
                // nNextCh already holds the next unprocessed character
                bNextCh = false;
            }
        }

        if( bContinue && bNextCh )
            nNextCh = GetNextChar();
    }

    if (!aStrBuffer.isEmpty())
        aToken.append( aStrBuffer );
}


// Non-zero while SkipGroup() is running. It is a static member, i.e. one
// counter shared by all SvRTFParser objects: SkipGroup() returns immediately
// when it is already set, and GetNextToken_() does not process \upr while it
// is set.
short SvRTFParser::_inSkipGroup=0;

/**
 * Skips the remainder of the group the parser is currently inside of.
 *
 * Tokens are read with GetNextToken_() and thrown away until the '}' that
 * closes the current group is the look-ahead character (nNextCh); nested
 * groups are tracked with a local bracket counter. The payload of a \bin
 * keyword is jumped over with a stream seek instead of being tokenized
 * (#i16185#).
 *
 * While a skip is running the static counter _inSkipGroup is non-zero; a call
 * made while it is already non-zero returns immediately without reading
 * anything. The counter is released on every way out of this function,
 * including an exception thrown while reading, because a value left behind
 * would disable SkipGroup() for every parser object from then on.
 *
 * If the loop ends without having reached the closing brace (end of input or
 * parser no longer working) and the parser is not Pending, eState is set to
 * SvParserState::Error.
 */
void SvRTFParser::SkipGroup()
{
    if (_inSkipGroup>0)
        return;

    // Marks the skip as active for exactly the lifetime of this call; the
    // destructor runs on normal return, early return and stack unwinding.
    struct ActiveSkipMarker
    {
        short& m_rCounter;
        explicit ActiveSkipMarker(short& rCounter) : m_rCounter(rCounter) { ++m_rCounter; }
        ~ActiveSkipMarker() { --m_rCounter; }
        ActiveSkipMarker(const ActiveSkipMarker&) = delete;
        ActiveSkipMarker& operator=(const ActiveSkipMarker&) = delete;
    };
    const ActiveSkipMarker aActiveSkip(_inSkipGroup);

    // int instead of short: the nesting depth comes from the document and must
    // not wrap around for input with tens of thousands of nested groups.
    int nBrackets = 1;
//#i16185# faking \bin keyword
    do
    {
        switch (nNextCh)
        {
            case '{':
                ++nBrackets;
                break;
            case '}':
                // closing brace of the group that is being skipped: done,
                // nNextCh stays on the brace for the caller
                if (!--nBrackets)
                    return;
                break;
        }
        int nToken = GetNextToken_();
        if (nToken == RTF_BIN)
        {
            // After a control word nNextCh already holds the character behind
            // the keyword. Step back onto it so that the nTokenValue positions
            // of binary data are counted from there (assumes one stream
            // position per character).
            rInput.SeekRel(-1);
            SAL_WARN_IF(nTokenValue < 0, "svtools", "negative value argument for rtf \\bin keyword");
            if (nTokenValue > 0)
                rInput.SeekRel(nTokenValue);
            nNextCh = GetNextChar();
        }
        // line ends between tokens are not interesting here
        while (nNextCh==0xa || nNextCh==0xd)
        {
            nNextCh = GetNextChar();
        }
    } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());

    if( SvParserState::Pending != eState && '}' != nNextCh )
        eState = SvParserState::Error;
}

// Data of an unknown destination is not interpreted: the rest of the current
// group is skipped.
void SvRTFParser::ReadUnknownData()
{
    SkipGroup();
}
// Bitmap data is not interpreted by this implementation: the rest of the
// current group is skipped.
void SvRTFParser::ReadBitmapData()
{
    SkipGroup();
}


/**
 * Starts parsing the input stream.
 *
 * The first character is read as look-ahead, the parser state, bracket count
 * and code set are reset to their start values, and the input is checked to
 * begin with '{' followed by the \rtf keyword. Only then the token loop
 * (Continue()) is run; otherwise the state becomes SvParserState::Error.
 *
 * @return the parser state after parsing
 */
SvParserState SvRTFParser::CallParser()
{
    // prime the look-ahead with the first character of the stream
    nNextChPos = rInput.Tell();
    char cStart(0);
    rInput.ReadChar( cStart );
    nNextCh = static_cast<unsigned char>(cStart);

    // start values for a fresh parse
    eState = SvParserState::Working;
    nOpenBrackets = 0;
    eCodeSet = RTL_TEXTENCODING_MS_1252;
    SetSrcEncoding( eCodeSet );

    // the first two tokens should be '{' and \\rtf !!
    const bool bLooksLikeRtf = '{' == GetNextToken() && RTF_RTF == GetNextToken();
    if( !bLooksLikeRtf )
    {
        eState = SvParserState::Error;
        return eState;
    }

    {
        AddFirstRef();
        // Drop the reference again when this scope is left, also if an
        // exception passes through - unless parsing is merely suspended.
        comphelper::ScopeGuard aReleaseGuard([this] {
            if( SvParserState::Pending != eState )
                ReleaseRef();       // now parser is not needed anymore
        });
        Continue( 0 );
    }

    return eState;
}

/**
 * Main token loop: handles the tokens that concern the parser itself (group
 * brackets, unknown destinations, code set keywords) and passes everything
 * else to NextToken().
 *
 * @param nToken  token to start with; 0 means "read the next token first"
 *
 * The loop runs while the parser is working. It also stops when a round
 * neither advanced the token index nor produced a different token, which
 * would otherwise repeat forever. Reaching the Accepted state while groups
 * are still open is turned into an error at the end.
 */
void SvRTFParser::Continue( int nToken )
{
    if( !nToken )
        nToken = GetNextToken();

    bool bStuck = false;

    while (IsParserWorking() && !bStuck)
    {
        // snapshot for the no-progress check at the end of the round
        const auto nIndexBefore = m_nTokenIndex;
        const auto nTokenBefore = nToken;

        // set by the cases below when the token is to be passed to NextToken()
        bool bForward = false;
        // set when a code set keyword changed eCodeSet
        bool bCodeSetChanged = false;

        SaveState( nToken );
        switch( nToken )
        {
        case '}':
            if( nOpenBrackets )
                bForward = true;
            else
                // the outermost group was closed: the document is complete
                eState = SvParserState::Accepted;
            break;

        case '{':
            // an unknown group ?
            if( RTF_IGNOREFLAG != GetNextToken() )
            {
                // ordinary group: un-read the token just looked at
                nToken = SkipToken();
                bForward = true;
            }
            else if( RTF_UNKNOWNCONTROL != GetNextToken() )
            {
                // "{\*" followed by a known keyword: un-read both tokens
                nToken = SkipToken( -2 );
                bForward = true;
            }
            else
            {
                // "{\*" followed by an unknown keyword: filter immediately
                ReadUnknownData();
                nToken = GetNextToken();
                if( '}' != nToken )
                    eState = SvParserState::Error;
                // the closing bracket is consumed here, move to next token
            }
            break;

        case RTF_UNKNOWNCONTROL:
            // skip unknown token
            break;

        case RTF_NEXTTYPE:
        case RTF_ANSITYPE:
            eCodeSet = RTL_TEXTENCODING_MS_1252;
            bCodeSetChanged = true;
            break;
        case RTF_MACTYPE:
            eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN;
            bCodeSetChanged = true;
            break;
        case RTF_PCTYPE:
            eCodeSet = RTL_TEXTENCODING_IBM_437;
            bCodeSetChanged = true;
            break;
        case RTF_PCATYPE:
            eCodeSet = RTL_TEXTENCODING_IBM_850;
            bCodeSetChanged = true;
            break;
        case RTF_ANSICPG:
            eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
            bCodeSetChanged = true;
            break;

        default:
            bForward = true;
            break;
        }

        if( bCodeSetChanged )
            SetSrcEncoding( eCodeSet );
        if( bForward )
            NextToken( nToken );

        if( IsParserWorking() )
            SaveState( 0 );         // processed till here,
                                    // continue with new token!
        nToken = GetNextToken();
        bStuck = nIndexBefore == m_nTokenIndex && nToken == nTokenBefore;
    }

    if( SvParserState::Accepted == eState && 0 < nOpenBrackets )
        eState = SvParserState::Error;
}

/**
 * Switches the encoding used to decode the input from now on.
 *
 * @param eEnc  the new encoding; RTL_TEXTENCODING_DONTKNOW selects the code
 *              set returned by GetCodeSet()
 *
 * The encoding is also stored in the state of the innermost open group (if
 * there is one), from where it is restored when a nested group is closed.
 */
void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
{
    const rtl_TextEncoding eEffective =
        (RTL_TEXTENCODING_DONTKNOW == eEnc) ? GetCodeSet() : eEnc;

    if (!aParserStates.empty())
        aParserStates.top().eCodeSet = eEffective;

    SetSrcEncoding(eEffective);
}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Coverage Report

Created: 2025-07-07 10:01

Line	Count	Source (jump to first uncovered line)
1		/* -- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -- */
2		/*
3		* This file is part of the LibreOffice project.
4		*
5		* This Source Code Form is subject to the terms of the Mozilla Public
6		* License, v. 2.0. If a copy of the MPL was not distributed with this
7		* file, You can obtain one at http://mozilla.org/MPL/2.0/.
8		*
9		* This file incorporates work covered by the following license notice:
10		*
11		* Licensed to the Apache Software Foundation (ASF) under one or more
12		* contributor license agreements. See the NOTICE file distributed
13		* with this work for additional information regarding copyright
14		* ownership. The ASF licenses this file to you under the Apache
15		* License, Version 2.0 (the "License"); you may not use this file
16		* except in compliance with the License. You may obtain a copy of
17		* the License at http://www.apache.org/licenses/LICENSE-2.0 .
18		*/
19
20		#include <sal/config.h>
21		#include <sal/log.hxx>
22
23		#include <comphelper/scopeguard.hxx>
24
25		#include <rtl/character.hxx>
26		#include <rtl/strbuf.hxx>
27		#include <rtl/tencinfo.h>
28		#include <rtl/ustrbuf.hxx>
29		#include <tools/stream.hxx>
30		#include <tools/debug.hxx>
31		#include <svtools/rtftoken.h>
32		#include <svtools/parrtf.hxx>
33
34		const int MAX_STRING_LEN = 1024;
35
36	4.92M	#define RTF_ISDIGIT( c ) rtl::isAsciiDigit(c)
37	13.5M	#define RTF_ISALPHA( c ) rtl::isAsciiAlpha(c)
38
39		SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
40	10.4k	: SvParser<int>( rIn, nStackSize )
41	10.4k	, nOpenBrackets(0)
42	10.4k	, nUPRLevel(0)
43	10.4k	, eCodeSet(RTL_TEXTENCODING_MS_1252)
44	10.4k	, nUCharOverread(1)
45	10.4k	{
46		// default is ANSI-CodeSet
47	10.4k	SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
48	10.4k	bRTF_InTextRead = false;
49	10.4k	}
50
51		SvRTFParser::~SvRTFParser()
52	10.4k	{
53	10.4k	}
54
55
56		int SvRTFParser::GetNextToken_()
57	2.32M	{
58	2.32M	int nRet = 0;
59	2.46M	do {
60	2.46M	bool bNextCh = true;
61	2.46M	switch( nNextCh )
62	2.46M	{
63	1.10M	case '\\':
64	1.10M	{
65		// control characters
66	1.10M	nNextCh = GetNextChar();
67	1.10M	switch( nNextCh )
68	1.10M	{
69	3.53k	case '{':
70	3.94k	case '}':
71	15.6k	case '\\':
72	15.8k	case '+': // I found it in a RTF-file
73	16.8k	case '~': // nonbreaking space
74	17.4k	case '-': // optional hyphen
75	17.5k	case '_': // nonbreaking hyphen
76	22.9k	case '\'': // HexValue
77	22.9k	nNextCh = '\\';
78	22.9k	rInput.SeekRel( -1 );
79	22.9k	ScanText();
80	22.9k	nRet = RTF_TEXTTOKEN;
81	22.9k	bNextCh = 0 == nNextCh;
82	22.9k	break;
83
84	12.5k	case '*': // ignoreflag
85	12.5k	nRet = RTF_IGNOREFLAG;
86	12.5k	break;
87	1.82k	case ':': // subentry in an index entry
88	1.82k	nRet = RTF_SUBENTRYINDEX;
89	1.82k	break;
90	238	case '\|': // formula-character
91	238	nRet = RTF_FORMULA;
92	238	break;
93
94	324k	case 0x0a:
95	335k	case 0x0d:
96	335k	nRet = RTF_PAR;
97	335k	break;
98
99	727k	default:
100	727k	if( RTF_ISALPHA( nNextCh ) )
101	660k	{
102	660k	aToken = "\\";
103	660k	{
104	2.42M	do {
105	2.42M	aToken.appendUtf32(nNextCh);
106	2.42M	nNextCh = GetNextChar();
107	2.42M	} while( RTF_ISALPHA( nNextCh ) );
108	660k	}
109
110		// minus before numeric parameters
111	660k	bool bNegValue = false;
112	660k	if( '-' == nNextCh )
113	20.4k	{
114	20.4k	bNegValue = true;
115	20.4k	nNextCh = GetNextChar();
116	20.4k	}
117
118		// possible numeric parameter
119	660k	if( RTF_ISDIGIT( nNextCh ) )
120	266k	{
121	266k	OUStringBuffer aNumber;
122	578k	do {
123	578k	aNumber.append(static_cast<sal_Unicode>(nNextCh));
124	578k	nNextCh = GetNextChar();
125	578k	} while( RTF_ISDIGIT( nNextCh ) );
126	266k	nTokenValue = OUString::unacquired(aNumber).toInt32();
127	266k	if( bNegValue )
128	10.4k	nTokenValue = -nTokenValue;
129	266k	bTokenHasValue=true;
130	266k	}
131	393k	else if( bNegValue ) // restore minus
132	9.96k	{
133	9.96k	nNextCh = '-';
134	9.96k	rInput.SeekRel( -1 );
135	9.96k	}
136	660k	if( ' ' == nNextCh ) // blank is part of token!
137	78.5k	nNextCh = GetNextChar();
138
139		// search for the token in the table:
140	660k	if( 0 == (nRet = GetRTFToken( aToken )) )
141		// Unknown Control
142	100k	nRet = RTF_UNKNOWNCONTROL;
143
144		// bug 76812 - unicode token handled as normal text
145	660k	bNextCh = false;
146	660k	switch( nRet )
147	660k	{
148	2.29k	case RTF_UC:
149	2.29k	if( 0 <= nTokenValue )
150	1.42k	{
151	1.42k	nUCharOverread = static_cast<sal_uInt8>(nTokenValue);
152	1.42k	if (!aParserStates.empty())
153	1.10k	{
154		//cmc: other ifdef breaks #i3584
155	1.10k	aParserStates.top().nUCharOverread = nUCharOverread;
156	1.10k	}
157	1.42k	}
158	2.29k	aToken.setLength( 0 ); // #i47831# erase token to prevent the token from being treated as text
159		// read next token
160	2.29k	nRet = 0;
161	2.29k	break;
162
163	4.56k	case RTF_UPR:
164	4.56k	if (!_inSkipGroup)
165	3.73k	{
166	3.73k	if (nUPRLevel > 256) // fairly sure > 1 is probably an error, but provide some leeway
167	25	{
168	25	SAL_WARN("svtools", "urp stack too deep");
169	25	eState = SvParserState::Error;
170	25	break;
171	25	}
172
173	3.71k	++nUPRLevel;
174
175		// UPR - overread the group with the ansi
176		// information
177	3.71k	int nNextToken;
178	3.71k	do
179	7.70k	{
180	7.70k	nNextToken = GetNextToken_();
181	7.70k	}
182	7.70k	while (nNextToken != '{' && nNextToken != sal_Unicode(EOF) && IsParserWorking());
183
184	3.71k	SkipGroup();
185	3.71k	GetNextToken_(); // overread the last bracket
186	3.71k	nRet = 0;
187
188	3.71k	--nUPRLevel;
189	3.71k	}
190	4.53k	break;
191
192	10.2k	case RTF_U:
193	10.2k	if( !bRTF_InTextRead )
194	2.68k	{
195	2.68k	nRet = RTF_TEXTTOKEN;
196	2.68k	aToken = OUStringChar( static_cast<sal_Unicode>(nTokenValue) );
197
198		// overread the next n "RTF" characters. This
199		// can be also \{, \}, \'88
200	10.2k	for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
201	7.55k	{
202	7.55k	sal_uInt32 cAnsi = nNextCh;
203	8.19k	while( 0xD == cAnsi )
204	643	cAnsi = GetNextChar();
205	9.98k	while( 0xA == cAnsi )
206	2.42k	cAnsi = GetNextChar();
207
208	7.55k	if( '\\' == cAnsi &&
209	7.55k	'\'' == GetNextChar() )
210		// skip HexValue
211	241	GetHexValue();
212	7.55k	nNextCh = GetNextChar();
213	7.55k	}
214	2.68k	ScanText();
215	2.68k	bNextCh = 0 == nNextCh;
216	2.68k	}
217	10.2k	break;
218	660k	}
219	660k	}
220	66.4k	else if( SvParserState::Pending != eState )
221	66.4k	{
222		// Bug 34631 - "\ " read on - Blank as character
223		// eState = SvParserState::Error;
224	66.4k	bNextCh = false;
225	66.4k	}
226	727k	break;
227	1.10M	}
228	1.10M	}
229	1.10M	break;
230
231	1.10M	case sal_Unicode(EOF):
232	12.7k	eState = SvParserState::Accepted;
233	12.7k	nRet = nNextCh;
234	12.7k	break;
235
236	179k	case '{':
237	179k	{
238	179k	if( 0 <= nOpenBrackets )
239	177k	{
240	177k	RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
241	177k	aParserStates.push( aState );
242	177k	}
243	179k	++nOpenBrackets;
244	179k	DBG_ASSERT(
245	179k	static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
246	179k	"ParserStateStack unequal to bracket count" );
247	179k	nRet = nNextCh;
248	179k	}
249	179k	break;
250
251	79.7k	case '}':
252	79.7k	--nOpenBrackets;
253	79.7k	if( 0 <= nOpenBrackets )
254	73.8k	{
255	73.8k	aParserStates.pop();
256	73.8k	if( !aParserStates.empty() )
257	73.3k	{
258	73.3k	const RtfParserState_Impl& rRPS =
259	73.3k	aParserStates.top();
260	73.3k	nUCharOverread = rRPS.nUCharOverread;
261	73.3k	SetSrcEncoding( rRPS.eCodeSet );
262	73.3k	}
263	500	else
264	500	{
265	500	nUCharOverread = 1;
266	500	SetSrcEncoding( GetCodeSet() );
267	500	}
268	73.8k	}
269	79.7k	DBG_ASSERT(
270	79.7k	static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
271	79.7k	"ParserStateStack unequal to bracket count" );
272	79.7k	nRet = nNextCh;
273	79.7k	break;
274
275	18.5k	case 0x0d:
276	66.6k	case 0x0a:
277	66.6k	break;
278
279	1.02M	default:
280		// now normal text follows
281	1.02M	ScanText();
282	1.02M	nRet = RTF_TEXTTOKEN;
283	1.02M	bNextCh = 0 == nNextCh;
284	1.02M	break;
285	2.46M	}
286
287	2.46M	if( bNextCh )
288	1.23M	nNextCh = GetNextChar();
289
290	2.46M	} while( !nRet && SvParserState::Working == eState );
291	2.32M	return nRet;
292	2.32M	}
293
294
295		sal_Unicode SvRTFParser::GetHexValue()
296	27.7k	{
297		// collect Hex values
298	27.7k	int n;
299	27.7k	sal_Unicode nHexVal = 0;
300
301	83.3k	for( n = 0; n < 2; ++n )
302	55.5k	{
303	55.5k	nHexVal *= 16;
304	55.5k	nNextCh = GetNextChar();
305	55.5k	if( nNextCh >= '0' && nNextCh <= '9' )
306	18.9k	nHexVal += (nNextCh - 48);
307	36.5k	else if( nNextCh >= 'a' && nNextCh <= 'f' )
308	24.0k	nHexVal += (nNextCh - 87);
309	12.5k	else if( nNextCh >= 'A' && nNextCh <= 'F' )
310	943	nHexVal += (nNextCh - 55);
311	55.5k	}
312	27.7k	return nHexVal;
313	27.7k	}
314
315		void SvRTFParser::ScanText()
316	1.05M	{
317	1.05M	const sal_Unicode cBreak = 0;
318	1.05M	OUStringBuffer aStrBuffer;
319	1.05M	bool bContinue = true;
320	4.69M	while( bContinue && IsParserWorking() && aStrBuffer.getLength() < MAX_STRING_LEN)
321	3.65M	{
322	3.65M	bool bNextCh = true;
323	3.65M	switch( nNextCh )
324	3.65M	{
325	465k	case '\\':
326	465k	{
327	465k	nNextCh = GetNextChar();
328	465k	switch (nNextCh)
329	465k	{
330	9.12k	case '\'':
331	9.12k	{
332
333	9.12k	OStringBuffer aByteString;
334	26.4k	while (true)
335	26.4k	{
336	26.4k	char c = static_cast<char>(GetHexValue());
337		/*
338		* Note: \'00 is a valid internal character in a
339		* string in RTF. OStringBuffer supports
340		* appending nulls fine
341		*/
342	26.4k	aByteString.append(c);
343
344	26.4k	bool bBreak = false;
345	26.4k	bool bEOF = false;
346	26.4k	char nSlash = '\\';
347	215k	while (!bBreak)
348	189k	{
349	189k	auto next = GetNextChar();
350	189k	if (sal_Unicode(EOF) == next)
351	486	{
352	486	bEOF = true;
353	486	break;
354	486	}
355	189k	if (next>0xFF) // fix for #i43933# and #i35653#
356	3.59k	{
357	3.59k	if (!aByteString.isEmpty())
358	1.80k	{
359	1.80k	aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
360	1.80k	aByteString.setLength(0);
361	1.80k	}
362	3.59k	aStrBuffer.append(static_cast<sal_Unicode>(next));
363
364	3.59k	continue;
365	3.59k	}
366	185k	nSlash = static_cast<char>(next);
367	188k	while (nSlash == 0xD \|\| nSlash == 0xA)
368	2.09k	nSlash = static_cast<char>(GetNextChar());
369
370	185k	switch (nSlash)
371	185k	{
372	803	case '{':
373	2.58k	case '}':
374	25.9k	case '\\':
375	25.9k	bBreak = true;
376	25.9k	break;
377	159k	default:
378	159k	aByteString.append(nSlash);
379	159k	break;
380	185k	}
381	185k	}
382
383	26.4k	if (bEOF)
384	486	{
385	486	bContinue = false; // abort, string together
386	486	break;
387	486	}
388
389	25.9k	nNextCh = GetNextChar();
390
391	25.9k	if (nSlash != '\\' \|\| nNextCh != '\'')
392	8.63k	{
393	8.63k	rInput.SeekRel(-1);
394	8.63k	nNextCh = static_cast<unsigned char>(nSlash);
395	8.63k	break;
396	8.63k	}
397	25.9k	}
398
399	9.12k	bNextCh = false;
400
401	9.12k	if (!aByteString.isEmpty())
402	8.88k	{
403	8.88k	aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
404	8.88k	aByteString.setLength(0);
405	8.88k	}
406	9.12k	}
407	0	break;
408	34.8k	case '\\':
409	40.3k	case '}':
410	52.4k	case '{':
411	52.8k	case '+': // I found in a RTF file
412	52.8k	aStrBuffer.append(sal_Unicode(nNextCh));
413	52.8k	break;
414	758	case '~': // nonbreaking space
415	758	aStrBuffer.append(u'\x00A0');
416	758	break;
417	1.35k	case '-': // optional hyphen
418	1.35k	aStrBuffer.append(u'\x00AD');
419	1.35k	break;
420	341	case '_': // nonbreaking hyphen
421	341	aStrBuffer.append(u'\x2011');
422	341	break;
423
424	16.3k	case 'u':
425		// read UNI-Code characters
426	16.3k	{
427	16.3k	nNextCh = GetNextChar();
428	16.3k	rInput.SeekRel( -2 );
429
430	16.3k	if( '-' == nNextCh \|\| RTF_ISDIGIT( nNextCh ) )
431	7.52k	{
432	7.52k	bRTF_InTextRead = true;
433
434	7.52k	OUString sSave( aToken ); // GetNextToken_() overwrites this
435	7.52k	nNextCh = '\\';
436	7.52k	int nToken = GetNextToken_();
437	7.52k	DBG_ASSERT( RTF_U == nToken, "still not a UNI-Code character" );
438		// don't convert symbol chars
439	7.52k	aStrBuffer.append(static_cast< sal_Unicode >(nTokenValue));
440
441		// overread the next n "RTF" characters. This
442		// can be also \{, \}, \'88
443	14.8k	for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
444	7.35k	{
445	7.35k	sal_Unicode cAnsi = nNextCh;
446	7.82k	while( 0xD == cAnsi )
447	464	cAnsi = GetNextChar();
448	8.07k	while( 0xA == cAnsi )
449	717	cAnsi = GetNextChar();
450
451	7.35k	if( '\\' == cAnsi &&
452	7.35k	'\'' == GetNextChar() )
453		// skip HexValue
454	1.11k	GetHexValue();
455	7.35k	nNextCh = GetNextChar();
456	7.35k	}
457	7.52k	bNextCh = false;
458	7.52k	aToken = sSave;
459	7.52k	bRTF_InTextRead = false;
460	7.52k	}
461	8.79k	else if ( 'c' == nNextCh )
462	2.86k	{
463		// Prevent text breaking into multiple tokens.
464	2.86k	rInput.SeekRel( 2 );
465	2.86k	nNextCh = GetNextChar();
466	2.86k	if (RTF_ISDIGIT( nNextCh ))
467	2.42k	{
468	2.42k	sal_uInt8 nNewOverread = 0 ;
469	3.17k	do {
470	3.17k	nNewOverread *= 10;
471	3.17k	nNewOverread += nNextCh - '0';
472	3.17k	nNextCh = GetNextChar();
473	3.17k	} while ( RTF_ISDIGIT( nNextCh ) );
474	2.42k	nUCharOverread = nNewOverread;
475	2.42k	if (!aParserStates.empty())
476	2.29k	aParserStates.top().nUCharOverread = nNewOverread;
477	2.42k	}
478	2.86k	bNextCh = 0x20 == nNextCh;
479	2.86k	}
480	5.93k	else
481	5.93k	{
482	5.93k	nNextCh = '\\';
483	5.93k	bContinue = false; // abort, string together
484	5.93k	}
485	16.3k	}
486	16.3k	break;
487
488	384k	default:
489	384k	rInput.SeekRel( -1 );
490	384k	nNextCh = '\\';
491	384k	bContinue = false; // abort, string together
492	384k	break;
493	465k	}
494	465k	}
495	465k	break;
496
497	465k	case sal_Unicode(EOF):
498	742	eState = SvParserState::Error;
499	742	[[fallthrough]];
500	50.1k	case '{':
501	103k	case '}':
502	103k	bContinue = false;
503	103k	break;
504
505	55.3k	case 0x0a:
506	62.9k	case 0x0d:
507	62.9k	break;
508
509	3.02M	default:
510	3.02M	if( nNextCh == cBreak \|\| aStrBuffer.getLength() >= MAX_STRING_LEN)
511	547k	bContinue = false;
512	2.47M	else
513	2.47M	{
514	5.19M	do {
515		// all other characters end up in the text
516	5.19M	aStrBuffer.appendUtf32(nNextCh);
517
518	5.19M	if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
519	5.81k	{
520	5.81k	if (!aStrBuffer.isEmpty())
521	5.81k	aToken.append( aStrBuffer );
522	5.81k	return;
523	5.81k	}
524	5.19M	} while
525	2.47M	(
526	5.18M	(RTF_ISALPHA(nNextCh) \|\| RTF_ISDIGIT(nNextCh)) &&
527	5.18M	(aStrBuffer.getLength() < MAX_STRING_LEN)
528	2.47M	);
529	2.46M	bNextCh = false;
530	2.46M	}
531	3.65M	}
532
533	3.64M	if( bContinue && bNextCh )
534	118k	nNextCh = GetNextChar();
535	3.64M	}
536
537	1.04M	if (!aStrBuffer.isEmpty())
538	587k	aToken.append( aStrBuffer );
539	1.04M	}
540
541
542		short SvRTFParser::_inSkipGroup=0;
543
544		void SvRTFParser::SkipGroup()
545	9.18k	{
546	9.18k	short nBrackets=1;
547	9.18k	if (_inSkipGroup>0)
548	0	return;
549	9.18k	_inSkipGroup++;
550		//#i16185# faking \bin keyword
551	9.18k	do
552	87.9k	{
553	87.9k	switch (nNextCh)
554	87.9k	{
555	31.7k	case '{':
556	31.7k	++nBrackets;
557	31.7k	break;
558	10.4k	case '}':
559	10.4k	if (!--nBrackets) {
560	5.57k	_inSkipGroup--;
561	5.57k	return;
562	5.57k	}
563	4.86k	break;
564	87.9k	}
565	82.3k	int nToken = GetNextToken_();
566	82.3k	if (nToken == RTF_BIN)
567	409	{
568	409	rInput.SeekRel(-1);
569	409	SAL_WARN_IF(nTokenValue < 0, "svtools", "negative value argument for rtf \\bin keyword");
570	409	if (nTokenValue > 0)
571	236	rInput.SeekRel(nTokenValue);
572	409	nNextCh = GetNextChar();
573	409	}
574	84.6k	while (nNextCh==0xa \|\| nNextCh==0xd)
575	2.29k	{
576	2.29k	nNextCh = GetNextChar();
577	2.29k	}
578	82.3k	} while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
579
580	3.60k	if( SvParserState::Pending != eState && '}' != nNextCh )
581	3.36k	eState = SvParserState::Error;
582	3.60k	_inSkipGroup--;
583	3.60k	}
584
585	2.73k	void SvRTFParser::ReadUnknownData() { SkipGroup(); }
586	32	void SvRTFParser::ReadBitmapData() { SkipGroup(); }
587
588
589		SvParserState SvRTFParser::CallParser()
590	10.4k	{
591	10.4k	char cFirstCh(0);
592	10.4k	nNextChPos = rInput.Tell();
593	10.4k	rInput.ReadChar( cFirstCh );
594	10.4k	nNextCh = static_cast<unsigned char>(cFirstCh);
595	10.4k	eState = SvParserState::Working;
596	10.4k	nOpenBrackets = 0;
597	10.4k	eCodeSet = RTL_TEXTENCODING_MS_1252;
598	10.4k	SetSrcEncoding( eCodeSet );
599
600		// the first two tokens should be '{' and \\rtf !!
601	10.4k	if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
602	9.87k	{
603	9.87k	AddFirstRef();
604		// call ReleaseRef at end of this scope, even in the face of exceptions
605	9.87k	comphelper::ScopeGuard g([this] {
606	9.87k	if( SvParserState::Pending != eState )
607	9.87k	ReleaseRef(); // now parser is not needed anymore
608	9.87k	});
609	9.87k	Continue( 0 );
610	9.87k	}
611	576	else
612	576	eState = SvParserState::Error;
613
614	10.4k	return eState;
615	10.4k	}
616
617		void SvRTFParser::Continue( int nToken )
618	9.87k	{
619		// DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
620		// "Characterset was changed." );
621
622	9.87k	if( !nToken )
623	9.87k	nToken = GetNextToken();
624
625	9.87k	bool bLooping = false;
626
627	1.81M	while (IsParserWorking() && !bLooping)
628	1.80M	{
629	1.80M	auto nCurrentTokenIndex = m_nTokenIndex;
630	1.80M	auto nCurrentToken = nToken;
631
632	1.80M	SaveState( nToken );
633	1.80M	switch( nToken )
634	1.80M	{
635	47.1k	case '}':
636	47.1k	if( nOpenBrackets )
637	46.7k	goto NEXTTOKEN;
638	369	eState = SvParserState::Accepted;
639	369	break;
640
641	96.3k	case '{':
642		// an unknown group ?
643	96.3k	{
644	96.3k	if( RTF_IGNOREFLAG != GetNextToken() )
645	92.8k	nToken = SkipToken();
646	3.52k	else if( RTF_UNKNOWNCONTROL != GetNextToken() )
647	2.28k	nToken = SkipToken( -2 );
648	1.23k	else
649	1.23k	{
650		// filter immediately
651	1.23k	ReadUnknownData();
652	1.23k	nToken = GetNextToken();
653	1.23k	if( '}' != nToken )
654	42	eState = SvParserState::Error;
655	1.23k	break; // move to next token!!
656	1.23k	}
657	96.3k	}
658	95.1k	goto NEXTTOKEN;
659
660	95.1k	case RTF_UNKNOWNCONTROL:
661	87.3k	break; // skip unknown token
662	0	case RTF_NEXTTYPE:
663	1.23k	case RTF_ANSITYPE:
664	1.23k	eCodeSet = RTL_TEXTENCODING_MS_1252;
665	1.23k	SetSrcEncoding( eCodeSet );
666	1.23k	break;
667	323	case RTF_MACTYPE:
668	323	eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN;
669	323	SetSrcEncoding( eCodeSet );
670	323	break;
671	403	case RTF_PCTYPE:
672	403	eCodeSet = RTL_TEXTENCODING_IBM_437;
673	403	SetSrcEncoding( eCodeSet );
674	403	break;
675	25	case RTF_PCATYPE:
676	25	eCodeSet = RTL_TEXTENCODING_IBM_850;
677	25	SetSrcEncoding( eCodeSet );
678	25	break;
679	4.97k	case RTF_ANSICPG:
680	4.97k	eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
681	4.97k	SetSrcEncoding(eCodeSet);
682	4.97k	break;
683	1.56M	default:
684	1.70M	NEXTTOKEN:
685	1.70M	NextToken( nToken );
686	1.70M	break;
687	1.80M	}
688	1.80M	if( IsParserWorking() )
689	1.80M	SaveState( 0 ); // processed till here,
690		// continue with new token!
691	1.80M	nToken = GetNextToken();
692	1.80M	bLooping = nCurrentTokenIndex == m_nTokenIndex && nToken == nCurrentToken;
693	1.80M	}
694	9.84k	if( SvParserState::Accepted == eState && 0 < nOpenBrackets )
695	8.61k	eState = SvParserState::Error;
696	9.84k	}
697
698		void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
699	50.9k	{
700	50.9k	if (eEnc == RTL_TEXTENCODING_DONTKNOW)
701	29.8k	eEnc = GetCodeSet();
702
703	50.9k	if (!aParserStates.empty())
704	50.6k	aParserStates.top().eCodeSet = eEnc;
705	50.9k	SetSrcEncoding(eEnc);
706	50.9k	}
707
708		/* vim:set shiftwidth=4 softtabstop=4 expandtab: */