Coverage Report

Created: 2025-11-16 09:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libreoffice/svtools/source/svhtml/parhtml.cxx
Line
Count
Source
1
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
/*
3
 * This file is part of the LibreOffice project.
4
 *
5
 * This Source Code Form is subject to the terms of the Mozilla Public
6
 * License, v. 2.0. If a copy of the MPL was not distributed with this
7
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8
 *
9
 * This file incorporates work covered by the following license notice:
10
 *
11
 *   Licensed to the Apache Software Foundation (ASF) under one or more
12
 *   contributor license agreements. See the NOTICE file distributed
13
 *   with this work for additional information regarding copyright
14
 *   ownership. The ASF licenses this file to you under the Apache
15
 *   License, Version 2.0 (the "License"); you may not use this file
16
 *   except in compliance with the License. You may obtain a copy of
17
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18
 */
19
20
#include <comphelper/string.hxx>
21
#include <o3tl/safeint.hxx>
22
#include <o3tl/string_view.hxx>
23
#include <tools/stream.hxx>
24
#include <tools/debug.hxx>
25
#include <tools/color.hxx>
26
#include <rtl/ustrbuf.hxx>
27
#include <rtl/character.hxx>
28
#include <rtl/tencinfo.h>
29
#include <sal/log.hxx>
30
#include <tools/tenccvt.hxx>
31
#include <tools/datetime.hxx>
32
#include <unotools/datetime.hxx>
33
#include <svl/inettype.hxx>
34
#include <svl/lngmisc.hxx>
35
#include <com/sun/star/beans/PropertyAttribute.hpp>
36
#include <com/sun/star/document/XDocumentProperties.hpp>
37
38
#include <svtools/parhtml.hxx>
39
#include <svtools/htmltokn.h>
40
#include <svtools/htmlkywd.hxx>
41
42
#include <utility>
43
44
using namespace ::com::sun::star;
45
46
47
const sal_Int32 MAX_LEN( 1024 );
48
49
const sal_Int32 MAX_ENTITY_LEN( 8 );
50
51
52
// Tables to convert option values into strings
53
54
// <INPUT TYPE=xxx>
55
HTMLOptionEnum<HTMLInputType> const aInputTypeOptEnums[] =
56
{
57
    { OOO_STRING_SVTOOLS_HTML_IT_text,      HTMLInputType::Text        },
58
    { OOO_STRING_SVTOOLS_HTML_IT_password,  HTMLInputType::Password    },
59
    { OOO_STRING_SVTOOLS_HTML_IT_checkbox,  HTMLInputType::Checkbox    },
60
    { OOO_STRING_SVTOOLS_HTML_IT_radio,     HTMLInputType::Radio       },
61
    { OOO_STRING_SVTOOLS_HTML_IT_range,     HTMLInputType::Range       },
62
    { OOO_STRING_SVTOOLS_HTML_IT_scribble,  HTMLInputType::Scribble    },
63
    { OOO_STRING_SVTOOLS_HTML_IT_file,      HTMLInputType::File        },
64
    { OOO_STRING_SVTOOLS_HTML_IT_hidden,    HTMLInputType::Hidden      },
65
    { OOO_STRING_SVTOOLS_HTML_IT_submit,    HTMLInputType::Submit      },
66
    { OOO_STRING_SVTOOLS_HTML_IT_image,     HTMLInputType::Image       },
67
    { OOO_STRING_SVTOOLS_HTML_IT_reset,     HTMLInputType::Reset       },
68
    { OOO_STRING_SVTOOLS_HTML_IT_button,    HTMLInputType::Button      },
69
    { nullptr,                              HTMLInputType(0)    }
70
};
71
72
// <TABLE FRAME=xxx>
73
HTMLOptionEnum<HTMLTableFrame> const aTableFrameOptEnums[] =
74
{
75
    { OOO_STRING_SVTOOLS_HTML_TF_void,    HTMLTableFrame::Void    },
76
    { OOO_STRING_SVTOOLS_HTML_TF_above,   HTMLTableFrame::Above   },
77
    { OOO_STRING_SVTOOLS_HTML_TF_below,   HTMLTableFrame::Below   },
78
    { OOO_STRING_SVTOOLS_HTML_TF_hsides,  HTMLTableFrame::HSides  },
79
    { OOO_STRING_SVTOOLS_HTML_TF_lhs,     HTMLTableFrame::LHS     },
80
    { OOO_STRING_SVTOOLS_HTML_TF_rhs,     HTMLTableFrame::RHS     },
81
    { OOO_STRING_SVTOOLS_HTML_TF_vsides,  HTMLTableFrame::VSides  },
82
    { OOO_STRING_SVTOOLS_HTML_TF_box,     HTMLTableFrame::Box     },
83
    { OOO_STRING_SVTOOLS_HTML_TF_border,  HTMLTableFrame::Box     },
84
    { nullptr,                            HTMLTableFrame(0) }
85
};
86
87
// <TABLE RULES=xxx>
88
HTMLOptionEnum<HTMLTableRules> const aTableRulesOptEnums[] =
89
{
90
    { OOO_STRING_SVTOOLS_HTML_TR_none,   HTMLTableRules::NONE      },
91
    { OOO_STRING_SVTOOLS_HTML_TR_groups, HTMLTableRules::Groups    },
92
    { OOO_STRING_SVTOOLS_HTML_TR_rows,   HTMLTableRules::Rows      },
93
    { OOO_STRING_SVTOOLS_HTML_TR_cols,   HTMLTableRules::Cols      },
94
    { OOO_STRING_SVTOOLS_HTML_TR_all,    HTMLTableRules::All       },
95
    { nullptr,                           HTMLTableRules(0) }
96
};
97
98
99
HTMLOption::HTMLOption( HtmlOptionId nTok, OUString _aToken,
100
                        OUString _aValue )
101
1.41M
    : aValue(std::move(_aValue))
102
1.41M
    , aToken(std::move(_aToken))
103
1.41M
    , nToken( nTok )
104
1.41M
{
105
1.41M
    DBG_ASSERT( nToken>=HtmlOptionId::BOOL_START && nToken<HtmlOptionId::END,
106
1.41M
        "HTMLOption: unknown token" );
107
1.41M
}
108
109
sal_uInt32 HTMLOption::GetNumber() const
110
94.4k
{
111
94.4k
    DBG_ASSERT( (nToken>=HtmlOptionId::NUMBER_START &&
112
94.4k
                 nToken<HtmlOptionId::NUMBER_END) ||
113
94.4k
                (nToken>=HtmlOptionId::CONTEXT_START &&
114
94.4k
                 nToken<HtmlOptionId::CONTEXT_END) ||
115
94.4k
                nToken==HtmlOptionId::VALUE,
116
94.4k
        "GetNumber: Option not numerical" );
117
94.4k
    OUString aTmp(comphelper::string::stripStart(aValue, ' '));
118
94.4k
    sal_Int32 nTmp = aTmp.toInt32();
119
94.4k
    return nTmp >= 0 ? static_cast<sal_uInt32>(nTmp) : 0;
120
94.4k
}
121
122
sal_Int32 HTMLOption::GetSNumber() const
123
1.43k
{
124
1.43k
    DBG_ASSERT( (nToken>=HtmlOptionId::NUMBER_START && nToken<HtmlOptionId::NUMBER_END) ||
125
1.43k
                (nToken>=HtmlOptionId::CONTEXT_START && nToken<HtmlOptionId::CONTEXT_END),
126
1.43k
        "GetSNumber: Option not numerical" );
127
1.43k
    OUString aTmp(comphelper::string::stripStart(aValue, ' '));
128
1.43k
    return aTmp.toInt32();
129
1.43k
}
130
131
void HTMLOption::GetNumbers( std::vector<sal_uInt32> &rNumbers ) const
132
2.05k
{
133
2.05k
    rNumbers.clear();
134
135
    // This is a very simplified scanner: it only searches all
136
    // numerals in the string.
137
2.05k
    bool bInNum = false;
138
2.05k
    sal_uInt32 nNum = 0;
139
30.2k
    for( sal_Int32 i=0; i<aValue.getLength(); i++ )
140
28.1k
    {
141
28.1k
        sal_Unicode c = aValue[ i ];
142
28.1k
        if( c>='0' && c<='9' )
143
16.1k
        {
144
16.1k
            nNum *= 10;
145
16.1k
            nNum += (c - '0');
146
16.1k
            bInNum = true;
147
16.1k
        }
148
12.0k
        else if( bInNum )
149
7.28k
        {
150
7.28k
            rNumbers.push_back( nNum );
151
7.28k
            bInNum = false;
152
7.28k
            nNum = 0;
153
7.28k
        }
154
28.1k
    }
155
2.05k
    if( bInNum )
156
1.82k
    {
157
1.82k
        rNumbers.push_back( nNum );
158
1.82k
    }
159
2.05k
}
160
161
void HTMLOption::GetColor( Color& rColor ) const
162
18.1k
{
163
18.1k
    DBG_ASSERT( (nToken>=HtmlOptionId::COLOR_START && nToken<HtmlOptionId::COLOR_END) || nToken==HtmlOptionId::SIZE,
164
18.1k
        "GetColor: Option is not a color." );
165
166
18.1k
    OUString aTmp(aValue.toAsciiLowerCase());
167
18.1k
    sal_uInt32 nColor = SAL_MAX_UINT32;
168
18.1k
    if (!aTmp.isEmpty() && aTmp[0] != '#')
169
5.82k
        nColor = GetHTMLColor(aTmp);
170
171
18.1k
    if( SAL_MAX_UINT32 == nColor )
172
15.5k
    {
173
15.5k
        nColor = 0;
174
15.5k
        sal_Int32 nPos = 0;
175
108k
        for (sal_uInt32 i=0; i<6; ++i)
176
93.0k
        {
177
            // Whatever Netscape does to get color values,
178
            // at maximum three characters < '0' are ignored.
179
93.0k
            sal_Unicode c = nPos<aTmp.getLength() ? aTmp[ nPos++ ] : '0';
180
93.0k
            if( c < '0' )
181
13.6k
            {
182
13.6k
                c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
183
13.6k
                if( c < '0' )
184
1.27k
                    c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
185
13.6k
            }
186
93.0k
            nColor *= 16;
187
93.0k
            if( c >= '0' && c <= '9' )
188
47.0k
                nColor += (c - '0');
189
46.0k
            else if( c >= 'a' && c <= 'f' )
190
36.0k
                nColor += (c + 0xa - 'a');
191
93.0k
        }
192
15.5k
    }
193
194
18.1k
    rColor.SetRed(   static_cast<sal_uInt8>((nColor & 0x00ff0000) >> 16) );
195
18.1k
    rColor.SetGreen( static_cast<sal_uInt8>((nColor & 0x0000ff00) >> 8));
196
18.1k
    rColor.SetBlue(  static_cast<sal_uInt8>(nColor & 0x000000ff) );
197
18.1k
}
198
199
HTMLInputType HTMLOption::GetInputType() const
200
0
{
201
0
    DBG_ASSERT( nToken==HtmlOptionId::TYPE, "GetInputType: Option not TYPE" );
202
0
    return GetEnum( aInputTypeOptEnums, HTMLInputType::Text );
203
0
}
204
205
HTMLTableFrame HTMLOption::GetTableFrame() const
206
3.88k
{
207
3.88k
    DBG_ASSERT( nToken==HtmlOptionId::FRAME, "GetTableFrame: Option not FRAME" );
208
3.88k
    return GetEnum( aTableFrameOptEnums );
209
3.88k
}
210
211
HTMLTableRules HTMLOption::GetTableRules() const
212
766
{
213
766
    DBG_ASSERT( nToken==HtmlOptionId::RULES, "GetTableRules: Option not RULES" );
214
766
    return GetEnum( aTableRulesOptEnums );
215
766
}
216
217
HTMLParser::HTMLParser( SvStream& rIn, bool bReadNewDoc ) :
218
42.7k
    SvParser<HtmlTokenId>( rIn ),
219
42.7k
    bNewDoc(bReadNewDoc),
220
42.7k
    bIsInHeader(true),
221
42.7k
    bReadListing(false),
222
42.7k
    bReadXMP(false),
223
42.7k
    bReadPRE(false),
224
42.7k
    bReadTextArea(false),
225
42.7k
    bReadScript(false),
226
42.7k
    bReadStyle(false),
227
42.7k
    bEndTokenFound(false),
228
42.7k
    bPre_IgnoreNewPara(false),
229
42.7k
    bReadNextChar(false),
230
42.7k
    bReadComment(false),
231
42.7k
    nPre_LinePos(0),
232
42.7k
    mnPendingOffToken(HtmlTokenId::NONE)
233
42.7k
{
234
    //#i76649, default to UTF-8 for HTML unless we know differently
235
42.7k
    SetSrcEncoding(RTL_TEXTENCODING_UTF8);
236
42.7k
}
237
238
HTMLParser::~HTMLParser()
239
42.7k
{
240
42.7k
}
241
242
void HTMLParser::SetNamespace(std::u16string_view rNamespace)
243
0
{
244
    // Convert namespace alias to a prefix.
245
0
    maNamespace = OUString::Concat(rNamespace) + ":";
246
0
}
247
248
namespace
249
{
250
    class RefGuard
251
    {
252
    private:
253
        HTMLParser& m_rParser;
254
    public:
255
        RefGuard(HTMLParser& rParser)
256
42.7k
            : m_rParser(rParser)
257
42.7k
        {
258
42.7k
            m_rParser.AddFirstRef();
259
42.7k
        }
260
261
        ~RefGuard()
262
42.7k
        {
263
42.7k
            if (m_rParser.GetStatus() != SvParserState::Pending)
264
42.7k
                m_rParser.ReleaseRef(); // Parser not needed anymore
265
42.7k
        }
266
    };
267
}
268
269
SvParserState HTMLParser::CallParser()
270
42.7k
{
271
42.7k
    eState = SvParserState::Working;
272
42.7k
    nNextCh = GetNextChar();
273
42.7k
    SaveState( HtmlTokenId::NONE );
274
275
42.7k
    nPre_LinePos = 0;
276
42.7k
    bPre_IgnoreNewPara = false;
277
278
42.7k
    RefGuard aRefGuard(*this);
279
280
42.7k
    Continue( HtmlTokenId::NONE );
281
282
42.7k
    return eState;
283
42.7k
}
284
285
void HTMLParser::Continue( HtmlTokenId nToken )
286
42.7k
{
287
42.7k
    if( nToken == HtmlTokenId::NONE )
288
42.7k
        nToken = GetNextToken();
289
290
4.83M
    while( IsParserWorking() )
291
4.79M
    {
292
4.79M
        SaveState( nToken );
293
4.79M
        nToken = FilterToken( nToken );
294
295
4.79M
        if( nToken != HtmlTokenId::NONE )
296
4.79M
            NextToken( nToken );
297
298
4.79M
        if( IsParserWorking() )
299
4.78M
            SaveState( HtmlTokenId::NONE );         // continue with new token
300
301
4.79M
        nToken = GetNextToken();
302
4.79M
    }
303
42.7k
}
304
305
HtmlTokenId HTMLParser::FilterToken( HtmlTokenId nToken )
306
7.19M
{
307
7.19M
    switch( nToken )
308
7.19M
    {
309
0
    case HtmlTokenId(EOF):
310
0
        nToken = HtmlTokenId::NONE;
311
0
        break;          // don't pass
312
313
13.8k
    case HtmlTokenId::HEAD_OFF:
314
13.8k
        bIsInHeader = false;
315
13.8k
        break;
316
317
14.4k
    case HtmlTokenId::HEAD_ON:
318
14.4k
        bIsInHeader = true;
319
14.4k
        break;
320
321
17.2k
    case HtmlTokenId::BODY_ON:
322
17.2k
        bIsInHeader = false;
323
17.2k
        break;
324
325
804
    case HtmlTokenId::FRAMESET_ON:
326
804
        bIsInHeader = false;
327
804
        break;
328
329
6.17k
    case HtmlTokenId::BODY_OFF:
330
6.17k
        bReadPRE = bReadListing = bReadXMP = false;
331
6.17k
        break;
332
333
3.91k
    case HtmlTokenId::HTML_OFF:
334
3.91k
        nToken = HtmlTokenId::NONE;
335
3.91k
        bReadPRE = bReadListing = bReadXMP = false;
336
3.91k
        break;      // HtmlTokenId::ON hasn't been passed either !
337
338
34.6k
    case HtmlTokenId::PREFORMTXT_ON:
339
34.6k
        StartPRE();
340
34.6k
        break;
341
342
2.03k
    case HtmlTokenId::PREFORMTXT_OFF:
343
2.03k
        FinishPRE();
344
2.03k
        break;
345
346
938
    case HtmlTokenId::LISTING_ON:
347
938
        StartListing();
348
938
        break;
349
350
335
    case HtmlTokenId::LISTING_OFF:
351
335
        FinishListing();
352
335
        break;
353
354
8.51k
    case HtmlTokenId::XMP_ON:
355
8.51k
        StartXMP();
356
8.51k
        break;
357
358
314
    case HtmlTokenId::XMP_OFF:
359
314
        FinishXMP();
360
314
        break;
361
362
7.08M
    default:
363
7.08M
        if( bReadPRE )
364
1.00M
            nToken = FilterPRE( nToken );
365
6.08M
        else if( bReadListing )
366
20.3k
            nToken = FilterListing( nToken );
367
6.06M
        else if( bReadXMP )
368
130k
            nToken = FilterXMP( nToken );
369
370
7.08M
        break;
371
7.19M
    }
372
373
7.19M
    return nToken;
374
7.19M
}
375
376
namespace {
377
378
12.1M
constexpr bool HTML_ISPRINTABLE(sal_Unicode c) { return c >= 32 && c != 127; }
379
380
constexpr bool HTML_ISSPACE(sal_uInt32 c)
381
4.14M
{
382
4.14M
    return ' ' == c || '\t' == c || '\r' == c || '\n' == c || '\x0b' == c;
383
4.14M
}
384
385
}
386
387
HtmlTokenId HTMLParser::ScanText(const sal_Unicode cBreak)
388
3.61M
{
389
3.61M
    OUStringBuffer sTmpBuffer( MAX_LEN );
390
3.61M
    bool bContinue = true;
391
3.61M
    bool bEqSignFound = false;
392
3.61M
    sal_uInt32  cQuote = 0U;
393
394
33.6M
    while( bContinue && IsParserWorking() )
395
30.0M
    {
396
30.0M
        bool bNextCh = true;
397
30.0M
        switch( nNextCh )
398
30.0M
        {
399
180k
        case '&':
400
180k
            bEqSignFound = false;
401
180k
            if( bReadXMP )
402
4.13k
                sTmpBuffer.append( '&' );
403
176k
            else
404
176k
            {
405
176k
                sal_uInt64 nStreamPos = rInput.Tell();
406
176k
                sal_uInt32 nLinePos = GetLinePos();
407
408
176k
                sal_uInt32 cChar = 0U;
409
176k
                if( '#' == (nNextCh = GetNextChar()) )
410
45.2k
                {
411
45.2k
                    nNextCh = GetNextChar();
412
45.2k
                    const bool bIsHex( 'x' == nNextCh );
413
45.2k
                    const bool bIsDecOrHex( bIsHex || rtl::isAsciiDigit(nNextCh) );
414
45.2k
                    if ( bIsDecOrHex )
415
40.3k
                    {
416
40.3k
                        if ( bIsHex )
417
9.73k
                        {
418
9.73k
                            nNextCh = GetNextChar();
419
32.2k
                            while ( rtl::isAsciiHexDigit(nNextCh) )
420
22.4k
                            {
421
22.4k
                                cChar = cChar * 16U +
422
22.4k
                                        ( nNextCh <= '9'
423
22.4k
                                          ? sal_uInt32( nNextCh - '0' )
424
22.4k
                                          : ( nNextCh <= 'F'
425
20.2k
                                              ? sal_uInt32( nNextCh - 'A' + 10 )
426
20.2k
                                              : sal_uInt32( nNextCh - 'a' + 10 ) ) );
427
22.4k
                                nNextCh = GetNextChar();
428
22.4k
                            }
429
9.73k
                        }
430
30.5k
                        else
431
30.5k
                        {
432
30.5k
                            do
433
116k
                            {
434
116k
                                cChar = cChar * 10U + sal_uInt32( nNextCh - '0');
435
116k
                                nNextCh = GetNextChar();
436
116k
                            }
437
116k
                            while( rtl::isAsciiDigit(nNextCh) );
438
30.5k
                        }
439
440
40.3k
                        if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
441
38.2k
                            RTL_TEXTENCODING_UCS2 != eSrcEnc &&
442
38.1k
                            RTL_TEXTENCODING_UTF8 != eSrcEnc &&
443
11.0k
                            cChar < 256 )
444
6.06k
                        {
445
6.06k
                            const sal_uInt32 convertFlags =
446
6.06k
                                RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
447
6.06k
                                RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
448
6.06k
                                RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT;
449
450
6.06k
                            char cEncodedChar = static_cast<char>(cChar);
451
6.06k
                            cChar = OUString(&cEncodedChar, 1, eSrcEnc, convertFlags).toChar();
452
6.06k
                            if( 0U == cChar )
453
346
                            {
454
                                // If the character could not be
455
                                // converted, because a conversion is not
456
                                // available, do no conversion at all.
457
346
                                cChar = cEncodedChar;
458
346
                            }
459
6.06k
                        }
460
40.3k
                    }
461
4.92k
                    else
462
4.92k
                        nNextCh = 0U;
463
464
45.2k
                    if (!rtl::isUnicodeCodePoint(cChar)
465
44.1k
                        || (linguistic::IsControlChar(cChar)
466
20.9k
                            && cChar != '\r' && cChar != '\n' && cChar != '\t'))
467
9.19k
                    {
468
9.19k
                        cChar = '?';
469
9.19k
                    }
470
45.2k
                }
471
130k
                else if( rtl::isAsciiAlpha( nNextCh ) )
472
96.7k
                {
473
96.7k
                    OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
474
96.7k
                    sal_Int32 nPos = 0;
475
96.7k
                    do
476
336k
                    {
477
336k
                        sEntityBuffer.appendUtf32( nNextCh );
478
336k
                        nPos++;
479
336k
                        nNextCh = GetNextChar();
480
336k
                    }
481
336k
                    while( nPos < MAX_ENTITY_LEN && rtl::isAsciiAlphanumeric( nNextCh ) &&
482
239k
                           !rInput.eof() );
483
484
96.7k
                    if( IsParserWorking() && !rInput.eof() )
485
96.4k
                    {
486
96.4k
                        std::u16string_view sEntity(sEntityBuffer.subView(0, nPos));
487
96.4k
                        cChar = GetHTMLCharName( sEntity );
488
489
                        // not found ( == 0 ): plain text
490
                        // or a character which is inserted as attribute
491
96.4k
                        if( 0U == cChar && ';' != nNextCh )
492
18.6k
                        {
493
18.6k
                            DBG_ASSERT( rInput.Tell() - nStreamPos ==
494
18.6k
                                        static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
495
18.6k
                                        "UTF-8 is failing here" );
496
46.9k
                            for( sal_Int32 i = nPos-1; i>1; i-- )
497
29.6k
                            {
498
29.6k
                                nNextCh = sEntityBuffer[i];
499
29.6k
                                sEntityBuffer.setLength( i );
500
29.6k
                                sEntity = sEntityBuffer.subView(0, i);
501
29.6k
                                cChar = GetHTMLCharName( sEntity );
502
29.6k
                                if( cChar )
503
1.35k
                                {
504
1.35k
                                    rInput.SeekRel( -static_cast<sal_Int64>
505
1.35k
                                            (nPos-i)*GetCharSize() );
506
1.35k
                                    nlLinePos -= sal_uInt32(nPos-i);
507
1.35k
                                    nPos = i;
508
1.35k
                                    ClearTxtConvContext();
509
1.35k
                                    break;
510
1.35k
                                }
511
29.6k
                            }
512
18.6k
                        }
513
514
96.4k
                        if( !cChar )        // unknown character?
515
25.6k
                        {
516
                            // back in stream, insert '&'
517
                            // and restart with next character
518
25.6k
                            sTmpBuffer.append( '&' );
519
520
25.6k
                            DBG_ASSERT( rInput.Tell()-nStreamPos ==
521
25.6k
                                        static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
522
25.6k
                                        "Wrong stream position" );
523
25.6k
                            DBG_ASSERT( nlLinePos-nLinePos ==
524
25.6k
                                        static_cast<sal_uInt32>(nPos+1),
525
25.6k
                                        "Wrong line position" );
526
25.6k
                            rInput.Seek( nStreamPos );
527
25.6k
                            nlLinePos = nLinePos;
528
25.6k
                            ClearTxtConvContext();
529
25.6k
                            break;
530
25.6k
                        }
531
532
96.4k
                        assert(cChar != 0);
533
534
                        // 1 == Non Breaking Space
535
                        // 2 == SoftHyphen
536
537
70.8k
                        if (cChar == 1 || cChar == 2)
538
35.6k
                        {
539
35.6k
                            if( '>' == cBreak )
540
2.05k
                            {
541
                                // When reading the content of a tag we have
542
                                // to change it to ' ' or '-'
543
2.05k
                                if( 1U == cChar )
544
1.21k
                                    cChar = ' ';
545
843
                                else //2U
546
843
                                    cChar = '-';
547
2.05k
                            }
548
33.6k
                            else
549
33.6k
                            {
550
                                // If not scanning a tag return token
551
33.6k
                                aToken.append( sTmpBuffer );
552
33.6k
                                sTmpBuffer.setLength(0);
553
554
33.6k
                                if( !aToken.isEmpty() )
555
9.32k
                                {
556
                                    // restart with character
557
9.32k
                                    nNextCh = '&';
558
9.32k
                                    DBG_ASSERT( rInput.Tell()-nStreamPos ==
559
9.32k
                                                static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
560
9.32k
                                                "Wrong stream position" );
561
9.32k
                                    DBG_ASSERT( nlLinePos-nLinePos ==
562
9.32k
                                                static_cast<sal_uInt32>(nPos+1),
563
9.32k
                                                "Wrong line position" );
564
9.32k
                                    rInput.Seek( nStreamPos );
565
9.32k
                                    nlLinePos = nLinePos;
566
9.32k
                                    ClearTxtConvContext();
567
9.32k
                                    return HtmlTokenId::TEXTTOKEN;
568
9.32k
                                }
569
570
                                // Hack: _GetNextChar shall not read the
571
                                // next character
572
24.2k
                                if( ';' != nNextCh )
573
915
                                    aToken.append( " " );
574
24.2k
                                if( 1U == cChar )
575
23.0k
                                    return HtmlTokenId::NONBREAKSPACE;
576
1.23k
                                else //2U
577
1.23k
                                    return HtmlTokenId::SOFTHYPH;
578
24.2k
                            }
579
35.6k
                        }
580
70.8k
                    }
581
267
                    else
582
267
                        nNextCh = 0U;
583
96.7k
                }
584
                // &{...};-JavaScript-Macros are not supported any longer.
585
34.0k
                else if( IsParserWorking() )
586
34.0k
                {
587
34.0k
                    sTmpBuffer.append( '&' );
588
34.0k
                    bNextCh = false;
589
34.0k
                    break;
590
34.0k
                }
591
592
82.7k
                bNextCh = (';' == nNextCh);
593
82.7k
                if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
594
41.0k
                                    cChar=='\"' || cChar==' ') )
595
14.6k
                {
596
                    // ' and " have to be escaped within tags to separate
597
                    // them from ' and " enclosing options.
598
                    // \ has to be escaped as well.
599
                    // Space is protected because it's not a delimiter between
600
                    // options.
601
14.6k
                    sTmpBuffer.append( '\\' );
602
14.6k
                }
603
82.7k
                if( IsParserWorking() )
604
82.7k
                {
605
82.7k
                    if( cChar )
606
82.4k
                        sTmpBuffer.appendUtf32( cChar );
607
82.7k
                }
608
0
                else if( SvParserState::Pending==eState && '>'!=cBreak )
609
0
                {
610
                    // Restart with '&', the remainder is returned as
611
                    // text token.
612
0
                    if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
613
0
                    {
614
                        // _GetNextChar() returns the previous text and
615
                        // during the next execution a new character is read.
616
                        // Thus we have to position in front of the '&'.
617
0
                        nNextCh = 0U;
618
0
                        rInput.Seek( nStreamPos - GetCharSize() );
619
0
                        nlLinePos = nLinePos-1;
620
0
                        ClearTxtConvContext();
621
0
                        bReadNextChar = true;
622
0
                    }
623
0
                    bNextCh = false;
624
0
                }
625
82.7k
            }
626
86.8k
            break;
627
1.76M
        case '=':
628
1.76M
            if( '>'==cBreak && !cQuote )
629
1.25M
                bEqSignFound = true;
630
1.76M
            sTmpBuffer.appendUtf32( nNextCh );
631
1.76M
            break;
632
633
105k
        case '\\':
634
105k
            if( '>'==cBreak )
635
67.7k
            {
636
                // mark within tags
637
67.7k
                sTmpBuffer.append( '\\' );
638
67.7k
            }
639
105k
            sTmpBuffer.append( '\\' );
640
105k
            break;
641
642
2.19M
        case '\"':
643
2.41M
        case '\'':
644
2.41M
            if( '>'==cBreak )
645
2.09M
            {
646
2.09M
                if( bEqSignFound )
647
924k
                    cQuote = nNextCh;
648
1.16M
                else if( cQuote && (cQuote==nNextCh ) )
649
919k
                    cQuote = 0U;
650
2.09M
            }
651
2.41M
            sTmpBuffer.appendUtf32( nNextCh );
652
2.41M
            bEqSignFound = false;
653
2.41M
            break;
654
655
2.42M
        case sal_Unicode(EOF):
656
2.42M
            if( rInput.eof() )
657
20.0k
            {
658
20.0k
                bContinue = false;
659
20.0k
            }
660
            // else: ignore, not a valid code point
661
2.42M
            break;
662
663
2.03M
        case '<':
664
2.03M
            bEqSignFound = false;
665
2.03M
            if( '>'==cBreak )
666
758k
                sTmpBuffer.appendUtf32( nNextCh );
667
1.27M
            else
668
1.27M
                bContinue = false;      // break, string is together
669
2.03M
            break;
670
671
68.3k
        case '\f':
672
68.3k
            if( '>' == cBreak )
673
63.0k
            {
674
                // If scanning options treat it like a space, ...
675
63.0k
                sTmpBuffer.append( ' ' );
676
63.0k
            }
677
5.35k
            else
678
5.35k
            {
679
                // otherwise it's a separate token.
680
5.35k
                bContinue = false;
681
5.35k
            }
682
68.3k
            break;
683
684
245k
        case '\r':
685
885k
        case '\n':
686
885k
            if( '>'==cBreak )
687
301k
            {
688
                // cr/lf in tag is handled in GetNextToken_()
689
301k
                sTmpBuffer.appendUtf32( nNextCh );
690
301k
                break;
691
301k
            }
692
584k
            else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
693
80.2k
            {
694
80.2k
                bContinue = false;
695
80.2k
                break;
696
80.2k
            }
697
            // Reduce sequence of CR/LF/BLANK/TAB to a single blank
698
503k
            [[fallthrough]];
699
634k
        case '\t':
700
634k
            if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
701
18.0k
            {
702
                // Pass Tabs up in <PRE>
703
18.0k
                bContinue = false;
704
18.0k
                break;
705
18.0k
            }
706
616k
            [[fallthrough]];
707
639k
        case '\x0b':
708
639k
            if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
709
3.20k
                '>'!=cBreak )
710
2.06k
            {
711
2.06k
                break;
712
2.06k
            }
713
637k
            if (!m_bPreserveSpaces)
714
632k
                nNextCh = ' ';
715
637k
            [[fallthrough]];
716
2.70M
        case ' ':
717
2.70M
            if (!m_bPreserveSpaces)
718
2.69M
            {
719
2.69M
                sTmpBuffer.appendUtf32(nNextCh);
720
2.69M
                if ('>' != cBreak && (!bReadListing && !bReadXMP && !bReadPRE && !bReadTextArea))
721
1.12M
                {
722
                    // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
723
1.12M
                    do
724
2.30M
                    {
725
2.30M
                        nNextCh = GetNextChar();
726
2.30M
                        if (sal_Unicode(EOF) == nNextCh && rInput.eof())
727
2.86k
                        {
728
2.86k
                            if (!aToken.isEmpty() || sTmpBuffer.getLength() > 1)
729
1.16k
                            {
730
                                // Have seen s.th. aside from blanks?
731
1.16k
                                aToken.append(sTmpBuffer);
732
1.16k
                                sTmpBuffer.setLength(0);
733
1.16k
                                return HtmlTokenId::TEXTTOKEN;
734
1.16k
                            }
735
1.70k
                            else
736
                                // Only read blanks: no text must be returned
737
                                // and GetNextToken_ has to read until EOF
738
1.70k
                                return HtmlTokenId::NONE;
739
2.86k
                        }
740
2.30M
                    } while (HTML_ISSPACE(nNextCh));
741
1.12M
                    bNextCh = false;
742
1.12M
                }
743
2.68M
                break;
744
2.69M
            }
745
14.2k
            [[fallthrough]];
746
17.9M
        default:
747
17.9M
            bEqSignFound = false;
748
17.9M
            if (nNextCh == cBreak && !cQuote)
749
2.13M
                bContinue = false;
750
15.8M
            else
751
15.8M
            {
752
45.2M
                do {
753
45.2M
                    if (!linguistic::IsControlChar(nNextCh) || HTML_ISSPACE(nNextCh))
754
43.4M
                    {
755
                    // All remaining characters make their way into the text.
756
43.4M
                        sTmpBuffer.appendUtf32( nNextCh );
757
43.4M
                    }
758
759
45.2M
                    nNextCh = GetNextChar();
760
45.2M
                    if( ( sal_Unicode(EOF) == nNextCh && rInput.eof() ) ||
761
45.2M
                        !IsParserWorking() )
762
38.9k
                    {
763
38.9k
                        if( !sTmpBuffer.isEmpty() )
764
38.2k
                            aToken.append( sTmpBuffer );
765
38.9k
                        return HtmlTokenId::TEXTTOKEN;
766
38.9k
                    }
767
45.2M
                } while( rtl::isAsciiAlpha( nNextCh ) || rtl::isAsciiDigit( nNextCh ) );
768
15.8M
                bNextCh = false;
769
15.8M
            }
770
30.0M
        }
771
772
29.9M
        if( bContinue && bNextCh )
773
9.47M
            nNextCh = GetNextChar();
774
29.9M
    }
775
776
3.54M
    if( !sTmpBuffer.isEmpty() )
777
2.83M
        aToken.append( sTmpBuffer );
778
779
3.54M
    return HtmlTokenId::TEXTTOKEN;
780
3.61M
}
781
782
HtmlTokenId HTMLParser::GetNextRawToken()
783
165k
{
784
165k
    OUStringBuffer sTmpBuffer( MAX_LEN );
785
786
165k
    if( bEndTokenFound )
787
7.86k
    {
788
        // During the last execution we already found the end token,
789
        // thus we don't have to search it again.
790
7.86k
        bReadScript = false;
791
7.86k
        bReadStyle = false;
792
7.86k
        aEndToken.clear();
793
7.86k
        bEndTokenFound = false;
794
795
7.86k
        return HtmlTokenId::NONE;
796
7.86k
    }
797
798
    // Default return value: HtmlTokenId::RAWDATA
799
157k
    bool bContinue = true;
800
157k
    HtmlTokenId nToken = HtmlTokenId::RAWDATA;
801
157k
    SaveState( HtmlTokenId::NONE );
802
4.43M
    while( bContinue && IsParserWorking() )
803
4.28M
    {
804
4.28M
        bool bNextCh = true;
805
4.28M
        switch( nNextCh )
806
4.28M
        {
807
121k
        case '<':
808
121k
            {
809
                // Maybe we've reached the end.
810
811
                // Save what we have read previously...
812
121k
                aToken.append( sTmpBuffer );
813
121k
                sTmpBuffer.setLength(0);
814
815
                // and remember position in stream.
816
121k
                sal_uInt64 nStreamPos = rInput.Tell();
817
121k
                sal_uInt32 nLineNr = GetLineNr();
818
121k
                sal_uInt32 nLinePos = GetLinePos();
819
820
                // Start of an end token?
821
121k
                bool bOffState = false;
822
121k
                if( '/' == (nNextCh = GetNextChar()) )
823
34.6k
                {
824
34.6k
                    bOffState = true;
825
34.6k
                    nNextCh = GetNextChar();
826
34.6k
                }
827
86.7k
                else if( '!' == nNextCh )
828
4.93k
                {
829
4.93k
                    sTmpBuffer.appendUtf32( nNextCh );
830
4.93k
                    nNextCh = GetNextChar();
831
4.93k
                }
832
833
                // Read following letters
834
518k
                while( (rtl::isAsciiAlpha(nNextCh) || '-'==nNextCh) &&
835
396k
                       IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
836
396k
                {
837
396k
                    sTmpBuffer.appendUtf32( nNextCh );
838
396k
                    nNextCh = GetNextChar();
839
396k
                }
840
841
121k
                OUString aTok( sTmpBuffer.toString() );
842
121k
                aTok = aTok.toAsciiLowerCase();
843
121k
                bool bDone = false;
844
121k
                if( bReadScript || !aEndToken.isEmpty() )
845
22.6k
                {
846
22.6k
                    if( !bReadComment )
847
13.4k
                    {
848
13.4k
                        if( aTok.startsWith( OOO_STRING_SVTOOLS_HTML_comment ) )
849
633
                        {
850
633
                            bReadComment = true;
851
633
                        }
852
12.7k
                        else
853
12.7k
                        {
854
                            // A script has to end with "</SCRIPT>". But
855
                            // ">" is optional for security reasons
856
12.7k
                            bDone = bOffState &&
857
4.68k
                            ( bReadScript
858
4.68k
                                ? aTok == OOO_STRING_SVTOOLS_HTML_script
859
4.68k
                                : aTok == aEndToken );
860
12.7k
                        }
861
13.4k
                    }
862
22.6k
                    if( bReadComment && '>'==nNextCh && aTok.endsWith( "--" ) )
863
103
                    {
864
                        // End of comment of style <!----->
865
103
                        bReadComment = false;
866
103
                    }
867
22.6k
                }
868
98.7k
                else
869
98.7k
                {
870
                    // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
871
98.7k
                    if( bOffState )
872
27.5k
                        bDone = aTok == OOO_STRING_SVTOOLS_HTML_style ||
873
15.9k
                                aTok == OOO_STRING_SVTOOLS_HTML_head;
874
71.2k
                    else
875
71.2k
                        bDone = aTok == OOO_STRING_SVTOOLS_HTML_body;
876
98.7k
                }
877
878
121k
                if( bDone )
879
15.2k
                {
880
                    // Done! Return the previously read string (if requested)
881
                    // and continue.
882
883
15.2k
                    bContinue = false;
884
885
                    // nToken==0 means, GetNextToken_ continues to read
886
15.2k
                    if( aToken.isEmpty() && (bReadStyle || bReadScript) )
887
8.15k
                    {
888
                        // Immediately close environment (or context?)
889
                        // and parse the end token
890
8.15k
                        bReadScript = false;
891
8.15k
                        bReadStyle = false;
892
8.15k
                        aEndToken.clear();
893
8.15k
                        nToken = HtmlTokenId::NONE;
894
8.15k
                    }
895
7.12k
                    else
896
7.12k
                    {
897
                        // Keep bReadScript/bReadStyle alive
898
                        // and parse end token during next execution
899
7.12k
                        bEndTokenFound = true;
900
7.12k
                    }
901
902
                    // Move backwards in stream to '<'
903
15.2k
                    rInput.Seek( nStreamPos );
904
15.2k
                    SetLineNr( nLineNr );
905
15.2k
                    SetLinePos( nLinePos );
906
15.2k
                    ClearTxtConvContext();
907
15.2k
                    nNextCh = '<';
908
909
                    // Don't append string to token.
910
15.2k
                    sTmpBuffer.setLength( 0 );
911
15.2k
                }
912
106k
                else
913
106k
                {
914
                    // remember "</" , everything else we find in the buffer
915
106k
                    aToken.append( "<" );
916
106k
                    if( bOffState )
917
20.3k
                        aToken.append( "/" );
918
919
106k
                    bNextCh = false;
920
106k
                }
921
121k
            }
922
121k
            break;
923
77.2k
        case '-':
924
77.2k
            sTmpBuffer.appendUtf32( nNextCh );
925
77.2k
            if( bReadComment )
926
2.74k
            {
927
2.74k
                bool bTwoMinus = false;
928
2.74k
                nNextCh = GetNextChar();
929
3.73k
                while( '-' == nNextCh && IsParserWorking() )
930
994
                {
931
994
                    bTwoMinus = true;
932
994
                    sTmpBuffer.appendUtf32( nNextCh );
933
994
                    nNextCh = GetNextChar();
934
994
                }
935
936
2.74k
                if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
937
468
                    bReadComment = false;
938
939
2.74k
                bNextCh = false;
940
2.74k
            }
941
77.2k
            break;
942
943
31.4k
        case '\r':
944
            // \r\n? closes the current text token (even if it's empty)
945
31.4k
            nNextCh = GetNextChar();
946
31.4k
            if( nNextCh=='\n' )
947
29.7k
                nNextCh = GetNextChar();
948
31.4k
            bContinue = false;
949
31.4k
            break;
950
110k
        case '\n':
951
            // \n closes the current text token (even if it's empty)
952
110k
            nNextCh = GetNextChar();
953
110k
            bContinue = false;
954
110k
            break;
955
56.3k
        case sal_Unicode(EOF):
956
            // eof closes the current text token and behaves like having read
957
            // an end token
958
56.3k
            if( rInput.eof() )
959
810
            {
960
810
                bContinue = false;
961
810
                if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
962
739
                {
963
739
                    bEndTokenFound = true;
964
739
                }
965
71
                else
966
71
                {
967
71
                    bReadScript = false;
968
71
                    bReadStyle = false;
969
71
                    aEndToken.clear();
970
71
                    nToken = HtmlTokenId::NONE;
971
71
                }
972
810
            }
973
56.3k
            break;
974
3.88M
        default:
975
3.88M
            if (!linguistic::IsControlChar(nNextCh) || nNextCh == '\t')
976
3.77M
            {
977
                // all remaining characters are appended to the buffer
978
3.77M
                sTmpBuffer.appendUtf32( nNextCh );
979
3.77M
            }
980
3.88M
            break;
981
4.28M
        }
982
983
4.28M
        if( !bContinue && !sTmpBuffer.isEmpty() )
984
122k
        {
985
122k
            aToken.append( sTmpBuffer );
986
122k
            sTmpBuffer.setLength(0);
987
122k
        }
988
989
4.28M
        if( bContinue && bNextCh )
990
4.01M
            nNextCh = GetNextChar();
991
4.28M
    }
992
993
157k
    if( IsParserWorking() )
994
157k
        SaveState( HtmlTokenId::NONE );
995
0
    else
996
0
        nToken = HtmlTokenId::NONE;
997
998
157k
    return nToken;
999
157k
}
1000
1001
// Scan next token
1002
HtmlTokenId HTMLParser::GetNextToken_()
1003
7.20M
{
1004
7.20M
    HtmlTokenId nRet = HtmlTokenId::NONE;
1005
7.20M
    sSaveToken.clear();
1006
1007
7.20M
    if (mnPendingOffToken != HtmlTokenId::NONE)
1008
257k
    {
1009
        // HtmlTokenId::<TOKEN>_OFF generated for HtmlTokenId::<TOKEN>_ON
1010
257k
        nRet = mnPendingOffToken;
1011
257k
        mnPendingOffToken = HtmlTokenId::NONE;
1012
257k
        aToken.setLength( 0 );
1013
257k
        return nRet;
1014
257k
    }
1015
1016
    // Delete options
1017
6.94M
    maOptions.clear();
1018
1019
6.94M
    if( !IsParserWorking() )        // Don't continue if already an error occurred
1020
247k
        return HtmlTokenId::NONE;
1021
1022
6.69M
    bool bReadNextCharSave = bReadNextChar;
1023
6.69M
    if( bReadNextChar )
1024
3.23k
    {
1025
3.23k
        DBG_ASSERT( !bEndTokenFound,
1026
3.23k
                    "Read a character despite </SCRIPT> was read?" );
1027
3.23k
        nNextCh = GetNextChar();
1028
3.23k
        if( !IsParserWorking() )        // Don't continue if already an error occurred
1029
0
            return HtmlTokenId::NONE;
1030
3.23k
        bReadNextChar = false;
1031
3.23k
    }
1032
1033
6.69M
    if( bReadScript || bReadStyle || !aEndToken.isEmpty() )
1034
165k
    {
1035
165k
        nRet = GetNextRawToken();
1036
165k
        if( nRet != HtmlTokenId::NONE || !IsParserWorking() )
1037
149k
            return nRet;
1038
165k
    }
1039
1040
6.56M
    do {
1041
6.56M
        bool bNextCh = true;
1042
6.56M
        switch( nNextCh )
1043
6.56M
        {
1044
3.74M
        case '<':
1045
3.74M
            {
1046
3.74M
                sal_uInt64 nStreamPos = rInput.Tell();
1047
3.74M
                sal_uInt32 nLineNr = GetLineNr();
1048
3.74M
                sal_uInt32 nLinePos = GetLinePos();
1049
1050
3.74M
                bool bOffState = false;
1051
3.74M
                if( '/' == (nNextCh = GetNextChar()) )
1052
730k
                {
1053
730k
                    bOffState = true;
1054
730k
                    nNextCh = GetNextChar();
1055
730k
                }
1056
                // Assume '<?' is a start of an XML declaration, ignore it.
1057
3.74M
                if (rtl::isAsciiAlpha(nNextCh) || nNextCh == '!' || nNextCh == '?')
1058
3.57M
                {
1059
3.57M
                    OUStringBuffer sTmpBuffer;
1060
12.6M
                    do {
1061
12.6M
                        sTmpBuffer.appendUtf32( nNextCh );
1062
12.6M
                        nNextCh = GetNextChar();
1063
12.6M
                        if (std::u16string_view(sTmpBuffer) == u"![CDATA[")
1064
1.30k
                            break;
1065
12.6M
                        if (bFuzzing && sTmpBuffer.getLength() > 1024)
1066
50
                        {
1067
50
                            SAL_WARN("svtools", "abandoning import for performance reasons with long tokens");
1068
50
                            eState = SvParserState::Error;
1069
50
                            break;
1070
50
                        }
1071
12.6M
                    } while( '>' != nNextCh && '/' != nNextCh && !rtl::isAsciiWhiteSpace( nNextCh ) &&
1072
9.10M
                            !linguistic::IsControlChar(nNextCh) &&
1073
9.06M
                             IsParserWorking() && !rInput.eof() );
1074
1075
3.57M
                    if( !sTmpBuffer.isEmpty() )
1076
3.57M
                    {
1077
3.57M
                        aToken.append( sTmpBuffer );
1078
3.57M
                        sTmpBuffer.setLength(0);
1079
3.57M
                    }
1080
1081
                    // Skip blanks
1082
4.85M
                    while( rtl::isAsciiWhiteSpace( nNextCh ) && IsParserWorking() )
1083
1.27M
                        nNextCh = GetNextChar();
1084
1085
3.57M
                    if( !IsParserWorking() )
1086
50
                    {
1087
50
                        if( SvParserState::Pending == eState )
1088
0
                            bReadNextChar = bReadNextCharSave;
1089
50
                        break;
1090
50
                    }
1091
1092
                    // Search token in table:
1093
3.57M
                    sSaveToken = aToken;
1094
3.57M
                    aToken = aToken.toString().toAsciiLowerCase();
1095
1096
3.57M
                    if (!maNamespace.isEmpty() && o3tl::starts_with(aToken, maNamespace))
1097
0
                        aToken.remove( 0, maNamespace.getLength());
1098
1099
3.57M
                    if( HtmlTokenId::NONE == (nRet = GetHTMLToken( aToken )) )
1100
                        // Unknown control
1101
395k
                        nRet = HtmlTokenId::UNKNOWNCONTROL_ON;
1102
1103
                    // If it's a token which can be switched off...
1104
3.57M
                    if( bOffState )
1105
718k
                    {
1106
718k
                         if( nRet >= HtmlTokenId::ONOFF_START )
1107
716k
                         {
1108
                            // and there is an off token, return off token instead
1109
716k
                            nRet = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1);
1110
716k
                         }
1111
1.63k
                         else if( HtmlTokenId::LINEBREAK!=nRet || !maNamespace.isEmpty())
1112
925
                         {
1113
                            // and there is no off token, return unknown token.
1114
                            // (except for </BR>, that is treated like <BR>)
1115
                            // No exception for XHTML, though.
1116
925
                            nRet = HtmlTokenId::UNKNOWNCONTROL_OFF;
1117
925
                         }
1118
718k
                    }
1119
1120
3.57M
                    if( nRet == HtmlTokenId::COMMENT )
1121
8.57k
                    {
1122
                        // fix: due to being case sensitive use sSaveToken as start of comment
1123
                        //      and append a blank.
1124
8.57k
                        aToken = sSaveToken;
1125
8.57k
                        if( '>'!=nNextCh )
1126
5.75k
                            aToken.append( " " );
1127
8.57k
                        sal_uInt64 nCStreamPos = 0;
1128
8.57k
                        sal_uInt32 nCLineNr = 0;
1129
8.57k
                        sal_uInt32 nCLinePos = 0;
1130
8.57k
                        sal_Int32 nCStrLen = 0;
1131
1132
8.57k
                        bool bDone = false;
1133
                        // Read until closing -->. If not found restart at first >
1134
8.57k
                        sTmpBuffer = aToken;
1135
1.27M
                        while( !bDone && !rInput.eof() && IsParserWorking() )
1136
1.27M
                        {
1137
1.27M
                            if( '>'==nNextCh )
1138
61.6k
                            {
1139
61.6k
                                if( !nCStreamPos )
1140
7.53k
                                {
1141
7.53k
                                    nCStreamPos = rInput.Tell();
1142
7.53k
                                    nCStrLen = sTmpBuffer.getLength();
1143
7.53k
                                    nCLineNr = GetLineNr();
1144
7.53k
                                    nCLinePos = GetLinePos();
1145
7.53k
                                }
1146
61.6k
                                bDone = sTmpBuffer.getLength() >= 2 && sTmpBuffer[sTmpBuffer.getLength() - 2] == '-' && sTmpBuffer[sTmpBuffer.getLength() - 1] == '-';
1147
61.6k
                                if( !bDone )
1148
54.5k
                                    sTmpBuffer.appendUtf32(nNextCh);
1149
61.6k
                            }
1150
1.20M
                            else if (!linguistic::IsControlChar(nNextCh)
1151
77.1k
                                || nNextCh == '\r' || nNextCh == '\n' || nNextCh == '\t')
1152
1.17M
                            {
1153
1.17M
                                sTmpBuffer.appendUtf32(nNextCh);
1154
1.17M
                            }
1155
1.27M
                            if( !bDone )
1156
1.26M
                                nNextCh = GetNextChar();
1157
1.27M
                        }
1158
8.57k
                        aToken = sTmpBuffer;
1159
8.57k
                        sTmpBuffer.setLength(0);
1160
8.57k
                        if( !bDone && IsParserWorking() && nCStreamPos )
1161
428
                        {
1162
428
                            rInput.Seek( nCStreamPos );
1163
428
                            SetLineNr( nCLineNr );
1164
428
                            SetLinePos( nCLinePos );
1165
428
                            ClearTxtConvContext();
1166
428
                            aToken.truncate(nCStrLen);
1167
428
                            nNextCh = '>';
1168
428
                        }
1169
8.57k
                    }
1170
3.57M
                    else if (nRet == HtmlTokenId::CDATA)
1171
1.30k
                    {
1172
                        // Read until the closing ]]>.
1173
1.30k
                        bool bDone = false;
1174
300k
                        while (!bDone && !rInput.eof() && IsParserWorking())
1175
298k
                        {
1176
298k
                            if (nNextCh == '>')
1177
11.8k
                            {
1178
11.8k
                                if (sTmpBuffer.getLength() >= 2)
1179
11.7k
                                {
1180
11.7k
                                    bDone = sTmpBuffer[sTmpBuffer.getLength() - 2] == ']'
1181
1.11k
                                            && sTmpBuffer[sTmpBuffer.getLength() - 1] == ']';
1182
11.7k
                                    if (bDone)
1183
1.06k
                                    {
1184
                                        // Ignore ]] at the end.
1185
1.06k
                                        sTmpBuffer.setLength(sTmpBuffer.getLength() - 2);
1186
1.06k
                                    }
1187
11.7k
                                }
1188
11.8k
                                if (!bDone)
1189
10.7k
                                {
1190
10.7k
                                    sTmpBuffer.appendUtf32(nNextCh);
1191
10.7k
                                }
1192
11.8k
                            }
1193
287k
                            else if (!linguistic::IsControlChar(nNextCh))
1194
222k
                            {
1195
222k
                                sTmpBuffer.appendUtf32(nNextCh);
1196
222k
                            }
1197
298k
                            if (!bDone)
1198
297k
                            {
1199
297k
                                nNextCh = GetNextChar();
1200
297k
                            }
1201
298k
                        }
1202
1.30k
                        aToken = sTmpBuffer;
1203
1.30k
                        sTmpBuffer.setLength(0);
1204
1.30k
                    }
1205
3.56M
                    else
1206
3.56M
                    {
1207
                        // TokenString not needed anymore
1208
3.56M
                        aToken.setLength( 0 );
1209
3.56M
                    }
1210
1211
                    // Read until closing '>'
1212
3.57M
                    if( '>' != nNextCh && IsParserWorking() )
1213
1.30M
                    {
1214
1.30M
                        ScanText( '>' );
1215
1216
                        // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
1217
                        // generate pending HtmlTokenId::<TOKEN>_OFF for HtmlTokenId::<TOKEN>_ON
1218
                        // Do not convert this to a single HtmlTokenId::<TOKEN>_OFF
1219
                        // which lead to fdo#56772.
1220
1.30M
                        if ((nRet >= HtmlTokenId::ONOFF_START) && o3tl::ends_with(aToken, u"/"))
1221
257k
                        {
1222
257k
                            mnPendingOffToken = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1);       // HtmlTokenId::<TOKEN>_ON -> HtmlTokenId::<TOKEN>_OFF
1223
257k
                            aToken.setLength( aToken.getLength()-1 );   // remove trailing '/'
1224
257k
                        }
1225
1.30M
                        if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
1226
27.5k
                        {
1227
                            // Move back in front of < and restart there.
1228
                            // Return < as text.
1229
27.5k
                            rInput.Seek( nStreamPos );
1230
27.5k
                            SetLineNr( nLineNr );
1231
27.5k
                            SetLinePos( nLinePos );
1232
27.5k
                            ClearTxtConvContext();
1233
1234
27.5k
                            aToken = "<";
1235
27.5k
                            nRet = HtmlTokenId::TEXTTOKEN;
1236
27.5k
                            nNextCh = GetNextChar();
1237
27.5k
                            bNextCh = false;
1238
27.5k
                            break;
1239
27.5k
                        }
1240
1.30M
                    }
1241
3.55M
                    if( SvParserState::Pending == eState )
1242
0
                        bReadNextChar = bReadNextCharSave;
1243
3.55M
                }
1244
163k
                else
1245
163k
                {
1246
163k
                    if( bOffState )
1247
11.8k
                    {
1248
                        // simply throw away everything
1249
11.8k
                        ScanText( '>' );
1250
11.8k
                        if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
1251
827
                        {
1252
                            // Move back in front of < and restart there.
1253
                            // Return < as text.
1254
827
                            rInput.Seek( nStreamPos );
1255
827
                            SetLineNr( nLineNr );
1256
827
                            SetLinePos( nLinePos );
1257
827
                            ClearTxtConvContext();
1258
1259
827
                            aToken = "<";
1260
827
                            nRet = HtmlTokenId::TEXTTOKEN;
1261
827
                            nNextCh = GetNextChar();
1262
827
                            bNextCh = false;
1263
827
                            break;
1264
827
                        }
1265
11.0k
                        if( SvParserState::Pending == eState )
1266
0
                            bReadNextChar = bReadNextCharSave;
1267
11.0k
                        aToken.setLength( 0 );
1268
11.0k
                    }
1269
151k
                    else if( '%' == nNextCh )
1270
1.60k
                    {
1271
1.60k
                        nRet = HtmlTokenId::UNKNOWNCONTROL_ON;
1272
1273
1.60k
                        sal_uInt64 nCStreamPos = rInput.Tell();
1274
1.60k
                        sal_uInt32 nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
1275
1276
1.60k
                        bool bDone = false;
1277
                        // Read until closing %>. If not found restart at first >.
1278
1.60k
                        sal_Unicode nLastTokenChar = !aToken.isEmpty() ? aToken[aToken.getLength() - 1] : 0;
1279
1.60k
                        OUStringBuffer aTmpBuffer(aToken);
1280
2.39M
                        while( !bDone && !rInput.eof() && IsParserWorking() )
1281
2.39M
                        {
1282
2.39M
                            bDone = '>'==nNextCh && nLastTokenChar == '%';
1283
2.39M
                            if( !bDone )
1284
2.39M
                            {
1285
2.39M
                                aTmpBuffer.appendUtf32(nNextCh);
1286
2.39M
                                nLastTokenChar = aTmpBuffer[aTmpBuffer.getLength() - 1];
1287
2.39M
                                nNextCh = GetNextChar();
1288
2.39M
                            }
1289
2.39M
                        }
1290
1.60k
                        if( !bDone && IsParserWorking() )
1291
1.01k
                        {
1292
1.01k
                            rInput.Seek( nCStreamPos );
1293
1.01k
                            SetLineNr( nCLineNr );
1294
1.01k
                            SetLinePos( nCLinePos );
1295
1.01k
                            ClearTxtConvContext();
1296
1.01k
                            aToken = "<%";
1297
1.01k
                            nRet = HtmlTokenId::TEXTTOKEN;
1298
1.01k
                            break;
1299
1.01k
                        }
1300
596
                        aToken = aTmpBuffer;
1301
596
                        aTmpBuffer.setLength(0);
1302
596
                        if( IsParserWorking() )
1303
596
                        {
1304
596
                            sSaveToken = aToken;
1305
596
                            aToken.setLength( 0 );
1306
596
                        }
1307
596
                    }
1308
149k
                    else
1309
149k
                    {
1310
149k
                        aToken = "<";
1311
149k
                        nRet = HtmlTokenId::TEXTTOKEN;
1312
149k
                        bNextCh = false;
1313
149k
                        break;
1314
149k
                    }
1315
163k
                }
1316
1317
3.56M
                if( IsParserWorking() )
1318
3.56M
                {
1319
3.56M
                    bNextCh = '>' == nNextCh;
1320
3.56M
                    switch( nRet )
1321
3.56M
                    {
1322
2.19k
                    case HtmlTokenId::TEXTAREA_ON:
1323
2.19k
                        bReadTextArea = true;
1324
2.19k
                        break;
1325
1.04k
                    case HtmlTokenId::TEXTAREA_OFF:
1326
1.04k
                        bReadTextArea = false;
1327
1.04k
                        break;
1328
2.44k
                    case HtmlTokenId::SCRIPT_ON:
1329
2.44k
                        if( !bReadTextArea )
1330
2.25k
                            bReadScript = true;
1331
2.44k
                        break;
1332
3.36k
                    case HtmlTokenId::SCRIPT_OFF:
1333
3.36k
                        if( !bReadTextArea )
1334
3.23k
                        {
1335
3.23k
                            bReadScript = false;
1336
                            // JavaScript might modify the stream,
1337
                            // thus the last character has to be read again.
1338
3.23k
                            bReadNextChar = true;
1339
3.23k
                            bNextCh = false;
1340
3.23k
                        }
1341
3.36k
                        break;
1342
1343
13.8k
                    case HtmlTokenId::STYLE_ON:
1344
13.8k
                        bReadStyle = true;
1345
13.8k
                        break;
1346
15.6k
                    case HtmlTokenId::STYLE_OFF:
1347
15.6k
                        bReadStyle = false;
1348
15.6k
                        break;
1349
3.52M
                    default: break;
1350
3.56M
                    }
1351
3.56M
                }
1352
3.56M
            }
1353
3.56M
            break;
1354
1355
3.56M
        case sal_Unicode(EOF):
1356
43.8k
            if( rInput.eof() )
1357
42.7k
            {
1358
42.7k
                eState = SvParserState::Accepted;
1359
42.7k
                nRet = HtmlTokenId(nNextCh);
1360
42.7k
            }
1361
1.11k
            else
1362
1.11k
            {
1363
                // Read normal text.
1364
1.11k
                goto scan_text;
1365
1.11k
            }
1366
42.7k
            break;
1367
1368
127k
        case '\f':
1369
            // form feeds are passed upwards separately
1370
127k
            nRet = HtmlTokenId::LINEFEEDCHAR; // !!! should be FORMFEEDCHAR
1371
127k
            break;
1372
1373
446k
        case '\n':
1374
595k
        case '\r':
1375
595k
            if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
1376
189k
            {
1377
189k
                sal_Unicode c = GetNextChar();
1378
189k
                if( ( '\n' != nNextCh || '\r' != c ) &&
1379
188k
                    ( '\r' != nNextCh || '\n' != c ) )
1380
173k
                {
1381
173k
                    bNextCh = false;
1382
173k
                    nNextCh = c;
1383
173k
                }
1384
189k
                nRet = HtmlTokenId::NEWPARA;
1385
189k
                break;
1386
189k
            }
1387
406k
            [[fallthrough]];
1388
575k
        case '\t':
1389
575k
            if( bReadPRE )
1390
162k
            {
1391
162k
                nRet = HtmlTokenId::TABCHAR;
1392
162k
                break;
1393
162k
            }
1394
412k
            [[fallthrough]];
1395
504k
        case ' ':
1396
504k
            [[fallthrough]];
1397
2.29M
        default:
1398
1399
2.29M
scan_text:
1400
            // "normal" text to come
1401
2.29M
            nRet = ScanText();
1402
2.29M
            bNextCh = 0 == aToken.getLength();
1403
1404
            // the text should be processed
1405
2.29M
            if( !bNextCh && eState == SvParserState::Pending )
1406
0
            {
1407
0
                eState = SvParserState::Working;
1408
0
                bReadNextChar = true;
1409
0
            }
1410
1411
2.29M
            break;
1412
6.56M
        }
1413
1414
6.56M
        if( bNextCh && SvParserState::Working == eState )
1415
4.58M
        {
1416
4.58M
            nNextCh = GetNextChar();
1417
4.58M
            if( SvParserState::Pending == eState && nRet != HtmlTokenId::NONE && HtmlTokenId::TEXTTOKEN != nRet )
1418
0
            {
1419
0
                bReadNextChar = true;
1420
0
                eState = SvParserState::Working;
1421
0
            }
1422
4.58M
        }
1423
1424
6.56M
    } while( nRet == HtmlTokenId::NONE && SvParserState::Working == eState );
1425
1426
6.54M
    if( SvParserState::Pending == eState )
1427
0
        nRet = HtmlTokenId::INVALID;      // s.th. invalid
1428
1429
6.54M
    return nRet;
1430
6.54M
}
1431
1432
void HTMLParser::UnescapeToken()
1433
13.2k
{
1434
13.2k
    sal_Int32 nPos=0;
1435
1436
13.2k
    bool bEscape = false;
1437
456k
    while( nPos < aToken.getLength() )
1438
443k
    {
1439
443k
        bool bOldEscape = bEscape;
1440
443k
        bEscape = false;
1441
443k
        if( '\\'==aToken[nPos] && !bOldEscape )
1442
1.07k
        {
1443
1.07k
            aToken.remove( nPos, 1 );
1444
1.07k
            bEscape = true;
1445
1.07k
        }
1446
442k
        else
1447
442k
        {
1448
442k
            nPos++;
1449
442k
        }
1450
443k
    }
1451
13.2k
}
1452
1453
const HTMLOptions& HTMLParser::GetOptions( HtmlOptionId const *pNoConvertToken )
1454
2.28M
{
1455
    // If the options for the current token have already been returned,
1456
    // return them once again.
1457
2.28M
    if (!maOptions.empty())
1458
82.9k
        return maOptions;
1459
1460
2.20M
    sal_Int32 nPos = 0;
1461
4.54M
    while( nPos < aToken.getLength() )
1462
2.33M
    {
1463
        // A letter? Option beginning here.
1464
2.33M
        if( rtl::isAsciiAlpha( aToken[nPos] ) )
1465
1.41M
        {
1466
1.41M
            HtmlOptionId nToken;
1467
1.41M
            OUString aValue;
1468
1.41M
            sal_Int32 nStt = nPos;
1469
1.41M
            sal_Unicode cChar = 0;
1470
1471
            // Actually only certain characters allowed.
1472
            // Netscape only looks for "=" and white space (c.f.
1473
            // Mozilla: PA_FetchRequestedNameValues in libparse/pa_mdl.c)
1474
9.24M
            while( nPos < aToken.getLength() )
1475
9.03M
            {
1476
9.03M
                cChar = aToken[nPos];
1477
9.03M
                if ( '=' == cChar ||!HTML_ISPRINTABLE(cChar) || rtl::isAsciiWhiteSpace(cChar) )
1478
1.20M
                    break;
1479
7.83M
                nPos++;
1480
7.83M
            }
1481
1482
1.41M
            OUString sName( aToken.subView( nStt, nPos-nStt ) );
1483
1484
            // PlugIns require original token name. Convert to lower case only for searching.
1485
1.41M
            nToken = GetHTMLOption( sName.toAsciiLowerCase() ); // Name is ready
1486
1.41M
            SAL_WARN_IF( nToken==HtmlOptionId::UNKNOWN, "svtools",
1487
1.41M
                        "GetOption: unknown HTML option '" << sName << "'" );
1488
1.41M
            bool bStripCRLF = (nToken < HtmlOptionId::SCRIPT_START ||
1489
392k
                               nToken >= HtmlOptionId::SCRIPT_END) &&
1490
1.40M
                              (!pNoConvertToken || nToken != *pNoConvertToken);
1491
1492
1.56M
            while( nPos < aToken.getLength() )
1493
1.34M
            {
1494
1.34M
                cChar = aToken[nPos];
1495
1.34M
                if ( HTML_ISPRINTABLE(cChar) && !rtl::isAsciiWhiteSpace(cChar) )
1496
1.19M
                    break;
1497
153k
                nPos++;
1498
153k
            }
1499
1500
            // Option with value?
1501
1.41M
            if( nPos!=aToken.getLength() && '='==cChar )
1502
1.07M
            {
1503
1.07M
                nPos++;
1504
1505
1.08M
                while( nPos < aToken.getLength() )
1506
1.07M
                {
1507
1.07M
                    cChar = aToken[nPos];
1508
1.07M
                    if ( HTML_ISPRINTABLE(cChar) && ' ' != cChar && '\t' != cChar && '\r' != cChar && '\n' != cChar )
1509
1.07M
                        break;
1510
7.57k
                    nPos++;
1511
7.57k
                }
1512
1513
1.07M
                if( nPos != aToken.getLength() )
1514
1.07M
                {
1515
1.07M
                    sal_Int32 nLen = 0;
1516
1.07M
                    nStt = nPos;
1517
1.07M
                    if( ('"'==cChar) || '\''==cChar )
1518
810k
                    {
1519
810k
                        sal_Unicode cEnd = cChar;
1520
810k
                        nPos++; nStt++;
1521
810k
                        bool bDone = false;
1522
810k
                        bool bEscape = false;
1523
19.0M
                        while( nPos < aToken.getLength() && !bDone )
1524
18.2M
                        {
1525
18.2M
                            bool bOldEscape = bEscape;
1526
18.2M
                            bEscape = false;
1527
18.2M
                            cChar = aToken[nPos];
1528
18.2M
                            switch( cChar )
1529
18.2M
                            {
1530
22.8k
                            case '\r':
1531
94.8k
                            case '\n':
1532
94.8k
                                if( bStripCRLF )
1533
84.9k
                                    aToken.remove( nPos, 1 );
1534
9.86k
                                else
1535
9.86k
                                {
1536
9.86k
                                    nPos++;
1537
9.86k
                                    nLen++;
1538
9.86k
                                }
1539
94.8k
                                break;
1540
51.9k
                            case '\\':
1541
51.9k
                                if( bOldEscape )
1542
20.2k
                                {
1543
20.2k
                                    nPos++;
1544
20.2k
                                    nLen++;
1545
20.2k
                                }
1546
31.6k
                                else
1547
31.6k
                                {
1548
31.6k
                                    aToken.remove( nPos, 1 );
1549
31.6k
                                    bEscape = true;
1550
31.6k
                                }
1551
51.9k
                                break;
1552
814k
                            case '"':
1553
876k
                            case '\'':
1554
876k
                                bDone = !bOldEscape && cChar==cEnd;
1555
876k
                                if( !bDone )
1556
65.7k
                                {
1557
65.7k
                                    nPos++;
1558
65.7k
                                    nLen++;
1559
65.7k
                                }
1560
876k
                                break;
1561
17.1M
                            default:
1562
17.1M
                                nPos++;
1563
17.1M
                                nLen++;
1564
17.1M
                                break;
1565
18.2M
                            }
1566
18.2M
                        }
1567
810k
                        if( nPos!=aToken.getLength() )
1568
810k
                            nPos++;
1569
810k
                    }
1570
260k
                    else
1571
260k
                    {
1572
                        // More liberal than the standard: allow all printable characters
1573
260k
                        bool bEscape = false;
1574
260k
                        bool bDone = false;
1575
2.20M
                        while( nPos < aToken.getLength() && !bDone )
1576
1.94M
                        {
1577
1.94M
                            bool bOldEscape = bEscape;
1578
1.94M
                            bEscape = false;
1579
1.94M
                            sal_Unicode c = aToken[nPos];
1580
1.94M
                            switch( c )
1581
1.94M
                            {
1582
104k
                            case ' ':
1583
104k
                                bDone = !bOldEscape;
1584
104k
                                if( !bDone )
1585
331
                                {
1586
331
                                    nPos++;
1587
331
                                    nLen++;
1588
331
                                }
1589
104k
                                break;
1590
1591
246
                            case '\t':
1592
1.29k
                            case '\r':
1593
2.68k
                            case '\n':
1594
2.68k
                                bDone = true;
1595
2.68k
                                break;
1596
1597
31.6k
                            case '\\':
1598
31.6k
                                if( bOldEscape )
1599
15.6k
                                {
1600
15.6k
                                    nPos++;
1601
15.6k
                                    nLen++;
1602
15.6k
                                }
1603
16.0k
                                else
1604
16.0k
                                {
1605
16.0k
                                    aToken.remove( nPos, 1 );
1606
16.0k
                                    bEscape = true;
1607
16.0k
                                }
1608
31.6k
                                break;
1609
1610
1.80M
                            default:
1611
1.80M
                                if( HTML_ISPRINTABLE( c ) )
1612
1.80M
                                {
1613
1.80M
                                    nPos++;
1614
1.80M
                                    nLen++;
1615
1.80M
                                }
1616
353
                                else
1617
353
                                    bDone = true;
1618
1.80M
                                break;
1619
1.94M
                            }
1620
1.94M
                        }
1621
260k
                    }
1622
1623
1.07M
                    if( nLen )
1624
1.06M
                        aValue = aToken.subView( nStt, nLen );
1625
1.07M
                }
1626
1.07M
            }
1627
1628
            // Token is known and can be saved
1629
1.41M
            maOptions.emplace_back(nToken, sName, aValue);
1630
1631
1.41M
        }
1632
926k
        else
1633
            // Ignore white space and unexpected characters
1634
926k
            nPos++;
1635
2.33M
    }
1636
1637
2.20M
    return maOptions;
1638
2.20M
}
1639
1640
HtmlTokenId HTMLParser::FilterPRE( HtmlTokenId nToken )
1641
1.00M
{
1642
1.00M
    switch( nToken )
1643
1.00M
    {
1644
    // in Netscape they only have impact in not empty paragraphs
1645
7.34k
    case HtmlTokenId::PARABREAK_ON:
1646
7.34k
        nToken = HtmlTokenId::LINEBREAK;
1647
7.34k
        [[fallthrough]];
1648
13.3k
    case HtmlTokenId::LINEBREAK:
1649
154k
    case HtmlTokenId::NEWPARA:
1650
154k
        nPre_LinePos = 0;
1651
154k
        if( bPre_IgnoreNewPara )
1652
2.50k
            nToken = HtmlTokenId::NONE;
1653
154k
        break;
1654
1655
162k
    case HtmlTokenId::TABCHAR:
1656
162k
        {
1657
162k
            sal_Int32 nSpaces = 8 - (nPre_LinePos % 8);
1658
162k
            DBG_ASSERT( aToken.isEmpty(), "Why is the token not empty?" );
1659
162k
            if (aToken.getLength() < nSpaces)
1660
162k
            {
1661
162k
                comphelper::string::padToLength(aToken, nSpaces, ' ');
1662
162k
            }
1663
162k
            nPre_LinePos += nSpaces;
1664
162k
            nToken = HtmlTokenId::TEXTTOKEN;
1665
162k
        }
1666
162k
        break;
1667
    // Keep those
1668
339k
    case HtmlTokenId::TEXTTOKEN:
1669
339k
        nPre_LinePos += aToken.getLength();
1670
339k
        break;
1671
1672
80
    case HtmlTokenId::SELECT_ON:
1673
401
    case HtmlTokenId::SELECT_OFF:
1674
401
    case HtmlTokenId::BODY_ON:
1675
850
    case HtmlTokenId::FORM_ON:
1676
3.24k
    case HtmlTokenId::FORM_OFF:
1677
3.86k
    case HtmlTokenId::INPUT:
1678
3.95k
    case HtmlTokenId::OPTION:
1679
4.20k
    case HtmlTokenId::TEXTAREA_ON:
1680
4.93k
    case HtmlTokenId::TEXTAREA_OFF:
1681
1682
11.1k
    case HtmlTokenId::IMAGE:
1683
11.3k
    case HtmlTokenId::APPLET_ON:
1684
11.6k
    case HtmlTokenId::APPLET_OFF:
1685
11.7k
    case HtmlTokenId::PARAM:
1686
12.3k
    case HtmlTokenId::EMBED:
1687
1688
14.9k
    case HtmlTokenId::HEAD1_ON:
1689
16.7k
    case HtmlTokenId::HEAD1_OFF:
1690
21.4k
    case HtmlTokenId::HEAD2_ON:
1691
23.4k
    case HtmlTokenId::HEAD2_OFF:
1692
24.8k
    case HtmlTokenId::HEAD3_ON:
1693
25.2k
    case HtmlTokenId::HEAD3_OFF:
1694
27.2k
    case HtmlTokenId::HEAD4_ON:
1695
27.9k
    case HtmlTokenId::HEAD4_OFF:
1696
30.0k
    case HtmlTokenId::HEAD5_ON:
1697
30.2k
    case HtmlTokenId::HEAD5_OFF:
1698
31.3k
    case HtmlTokenId::HEAD6_ON:
1699
31.5k
    case HtmlTokenId::HEAD6_OFF:
1700
32.1k
    case HtmlTokenId::BLOCKQUOTE_ON:
1701
32.4k
    case HtmlTokenId::BLOCKQUOTE_OFF:
1702
32.6k
    case HtmlTokenId::ADDRESS_ON:
1703
32.7k
    case HtmlTokenId::ADDRESS_OFF:
1704
36.0k
    case HtmlTokenId::HORZRULE:
1705
1706
39.4k
    case HtmlTokenId::CENTER_ON:
1707
40.2k
    case HtmlTokenId::CENTER_OFF:
1708
49.4k
    case HtmlTokenId::DIVISION_ON:
1709
52.3k
    case HtmlTokenId::DIVISION_OFF:
1710
1711
52.6k
    case HtmlTokenId::SCRIPT_ON:
1712
52.8k
    case HtmlTokenId::SCRIPT_OFF:
1713
63.6k
    case HtmlTokenId::RAWDATA:
1714
1715
71.8k
    case HtmlTokenId::TABLE_ON:
1716
76.2k
    case HtmlTokenId::TABLE_OFF:
1717
77.1k
    case HtmlTokenId::CAPTION_ON:
1718
77.4k
    case HtmlTokenId::CAPTION_OFF:
1719
79.1k
    case HtmlTokenId::COLGROUP_ON:
1720
80.1k
    case HtmlTokenId::COLGROUP_OFF:
1721
84.1k
    case HtmlTokenId::COL_ON:
1722
84.3k
    case HtmlTokenId::COL_OFF:
1723
84.4k
    case HtmlTokenId::THEAD_ON:
1724
84.7k
    case HtmlTokenId::THEAD_OFF:
1725
85.3k
    case HtmlTokenId::TFOOT_ON:
1726
85.6k
    case HtmlTokenId::TFOOT_OFF:
1727
86.4k
    case HtmlTokenId::TBODY_ON:
1728
86.9k
    case HtmlTokenId::TBODY_OFF:
1729
101k
    case HtmlTokenId::TABLEROW_ON:
1730
104k
    case HtmlTokenId::TABLEROW_OFF:
1731
171k
    case HtmlTokenId::TABLEDATA_ON:
1732
194k
    case HtmlTokenId::TABLEDATA_OFF:
1733
200k
    case HtmlTokenId::TABLEHEADER_ON:
1734
201k
    case HtmlTokenId::TABLEHEADER_OFF:
1735
1736
207k
    case HtmlTokenId::ANCHOR_ON:
1737
212k
    case HtmlTokenId::ANCHOR_OFF:
1738
215k
    case HtmlTokenId::BOLD_ON:
1739
217k
    case HtmlTokenId::BOLD_OFF:
1740
220k
    case HtmlTokenId::ITALIC_ON:
1741
222k
    case HtmlTokenId::ITALIC_OFF:
1742
222k
    case HtmlTokenId::STRIKE_ON:
1743
223k
    case HtmlTokenId::STRIKE_OFF:
1744
223k
    case HtmlTokenId::STRIKETHROUGH_ON:
1745
224k
    case HtmlTokenId::STRIKETHROUGH_OFF:
1746
227k
    case HtmlTokenId::UNDERLINE_ON:
1747
228k
    case HtmlTokenId::UNDERLINE_OFF:
1748
228k
    case HtmlTokenId::BASEFONT_ON:
1749
228k
    case HtmlTokenId::BASEFONT_OFF:
1750
230k
    case HtmlTokenId::FONT_ON:
1751
231k
    case HtmlTokenId::FONT_OFF:
1752
232k
    case HtmlTokenId::BLINK_ON:
1753
232k
    case HtmlTokenId::BLINK_OFF:
1754
235k
    case HtmlTokenId::SPAN_ON:
1755
237k
    case HtmlTokenId::SPAN_OFF:
1756
237k
    case HtmlTokenId::SUBSCRIPT_ON:
1757
238k
    case HtmlTokenId::SUBSCRIPT_OFF:
1758
238k
    case HtmlTokenId::SUPERSCRIPT_ON:
1759
238k
    case HtmlTokenId::SUPERSCRIPT_OFF:
1760
239k
    case HtmlTokenId::BIGPRINT_ON:
1761
240k
    case HtmlTokenId::BIGPRINT_OFF:
1762
240k
    case HtmlTokenId::SMALLPRINT_OFF:
1763
241k
    case HtmlTokenId::SMALLPRINT_ON:
1764
1765
249k
    case HtmlTokenId::EMPHASIS_ON:
1766
250k
    case HtmlTokenId::EMPHASIS_OFF:
1767
250k
    case HtmlTokenId::CITATION_ON:
1768
250k
    case HtmlTokenId::CITATION_OFF:
1769
251k
    case HtmlTokenId::STRONG_ON:
1770
251k
    case HtmlTokenId::STRONG_OFF:
1771
253k
    case HtmlTokenId::CODE_ON:
1772
253k
    case HtmlTokenId::CODE_OFF:
1773
254k
    case HtmlTokenId::SAMPLE_ON:
1774
254k
    case HtmlTokenId::SAMPLE_OFF:
1775
255k
    case HtmlTokenId::KEYBOARD_ON:
1776
255k
    case HtmlTokenId::KEYBOARD_OFF:
1777
256k
    case HtmlTokenId::VARIABLE_ON:
1778
257k
    case HtmlTokenId::VARIABLE_OFF:
1779
257k
    case HtmlTokenId::DEFINSTANCE_ON:
1780
257k
    case HtmlTokenId::DEFINSTANCE_OFF:
1781
258k
    case HtmlTokenId::SHORTQUOTE_ON:
1782
258k
    case HtmlTokenId::SHORTQUOTE_OFF:
1783
258k
    case HtmlTokenId::LANGUAGE_ON:
1784
258k
    case HtmlTokenId::LANGUAGE_OFF:
1785
259k
    case HtmlTokenId::AUTHOR_ON:
1786
259k
    case HtmlTokenId::AUTHOR_OFF:
1787
259k
    case HtmlTokenId::PERSON_ON:
1788
259k
    case HtmlTokenId::PERSON_OFF:
1789
260k
    case HtmlTokenId::ACRONYM_ON:
1790
260k
    case HtmlTokenId::ACRONYM_OFF:
1791
260k
    case HtmlTokenId::ABBREVIATION_ON:
1792
260k
    case HtmlTokenId::ABBREVIATION_OFF:
1793
261k
    case HtmlTokenId::INSERTEDTEXT_ON:
1794
261k
    case HtmlTokenId::INSERTEDTEXT_OFF:
1795
262k
    case HtmlTokenId::DELETEDTEXT_ON:
1796
262k
    case HtmlTokenId::DELETEDTEXT_OFF:
1797
264k
    case HtmlTokenId::TELETYPE_ON:
1798
265k
    case HtmlTokenId::TELETYPE_OFF:
1799
1800
265k
        break;
1801
1802
    // The remainder is treated as an unknown token.
1803
83.0k
    default:
1804
83.0k
        if( nToken != HtmlTokenId::NONE )
1805
83.0k
        {
1806
83.0k
            nToken =
1807
83.0k
                ( ((nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken))
1808
83.0k
                    ? HtmlTokenId::UNKNOWNCONTROL_OFF
1809
83.0k
                    : HtmlTokenId::UNKNOWNCONTROL_ON );
1810
83.0k
        }
1811
83.0k
        break;
1812
1.00M
    }
1813
1814
1.00M
    bPre_IgnoreNewPara = false;
1815
1816
1.00M
    return nToken;
1817
1.00M
}
1818
1819
HtmlTokenId HTMLParser::FilterXMP( HtmlTokenId nToken )
1820
130k
{
1821
130k
    switch( nToken )
1822
130k
    {
1823
41.3k
    case HtmlTokenId::NEWPARA:
1824
41.3k
        if( bPre_IgnoreNewPara )
1825
599
            nToken = HtmlTokenId::NONE;
1826
41.3k
        [[fallthrough]];
1827
91.3k
    case HtmlTokenId::TEXTTOKEN:
1828
91.3k
    case HtmlTokenId::NONBREAKSPACE:
1829
91.3k
    case HtmlTokenId::SOFTHYPH:
1830
91.3k
        break;              // kept
1831
1832
38.9k
    default:
1833
38.9k
        if( nToken != HtmlTokenId::NONE )
1834
38.9k
        {
1835
38.9k
            if( (nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken) )
1836
10.9k
            {
1837
10.9k
                sSaveToken = "</" + sSaveToken;
1838
10.9k
            }
1839
27.9k
            else
1840
27.9k
                sSaveToken = "<" + sSaveToken;
1841
38.9k
            if( !aToken.isEmpty() )
1842
13.1k
            {
1843
13.1k
                UnescapeToken();
1844
13.1k
                sSaveToken += " ";
1845
13.1k
                aToken.insert(0, sSaveToken);
1846
13.1k
            }
1847
25.8k
            else
1848
25.8k
                aToken = sSaveToken;
1849
38.9k
            aToken.append( ">" );
1850
38.9k
            nToken = HtmlTokenId::TEXTTOKEN;
1851
38.9k
        }
1852
38.9k
        break;
1853
130k
    }
1854
1855
130k
    bPre_IgnoreNewPara = false;
1856
1857
130k
    return nToken;
1858
130k
}
1859
1860
HtmlTokenId HTMLParser::FilterListing( HtmlTokenId nToken )
1861
20.3k
{
1862
20.3k
    switch( nToken )
1863
20.3k
    {
1864
6.13k
    case HtmlTokenId::NEWPARA:
1865
6.13k
        if( bPre_IgnoreNewPara )
1866
243
            nToken = HtmlTokenId::NONE;
1867
6.13k
        [[fallthrough]];
1868
13.5k
    case HtmlTokenId::TEXTTOKEN:
1869
14.0k
    case HtmlTokenId::NONBREAKSPACE:
1870
14.4k
    case HtmlTokenId::SOFTHYPH:
1871
14.4k
        break;      // kept
1872
1873
5.89k
    default:
1874
5.89k
        if( nToken != HtmlTokenId::NONE )
1875
5.89k
        {
1876
5.89k
            nToken =
1877
5.89k
                ( ((nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken))
1878
5.89k
                    ? HtmlTokenId::UNKNOWNCONTROL_OFF
1879
5.89k
                    : HtmlTokenId::UNKNOWNCONTROL_ON );
1880
5.89k
        }
1881
5.89k
        break;
1882
20.3k
    }
1883
1884
20.3k
    bPre_IgnoreNewPara = false;
1885
1886
20.3k
    return nToken;
1887
20.3k
}
1888
1889
bool HTMLParser::InternalImgToPrivateURL( OUString& rURL )
1890
51.8k
{
1891
51.8k
    bool bFound = false;
1892
1893
51.8k
    if( rURL.startsWith( OOO_STRING_SVTOOLS_HTML_internal_icon ) )
1894
0
    {
1895
0
        OUString aName( rURL.copy(14) );
1896
0
        switch( aName[0] )
1897
0
        {
1898
0
        case 'b':
1899
0
            bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata;
1900
0
            break;
1901
0
        case 'd':
1902
0
            bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed;
1903
0
            break;
1904
0
        case 'e':
1905
0
            bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_embed;
1906
0
            break;
1907
0
        case 'i':
1908
0
            bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure;
1909
0
            break;
1910
0
        case 'n':
1911
0
            bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound;
1912
0
            break;
1913
0
        }
1914
0
    }
1915
51.8k
    if( bFound )
1916
0
    {
1917
0
        OUString sTmp ( rURL );
1918
0
        rURL =  OOO_STRING_SVTOOLS_HTML_private_image;
1919
0
        rURL += sTmp;
1920
0
    }
1921
1922
51.8k
    return bFound;
1923
51.8k
}
1924
1925
namespace {
1926
1927
enum class HtmlMeta {
1928
    NONE = 0,
1929
    Author,
1930
    Description,
1931
    Keywords,
1932
    Refresh,
1933
    Classification,
1934
    Created,
1935
    ChangedBy,
1936
    Changed,
1937
    Generator,
1938
    SDFootnote,
1939
    SDEndnote,
1940
    ContentType
1941
};
1942
1943
}
1944
1945
// <META NAME=xxx>
1946
HTMLOptionEnum<HtmlMeta> const aHTMLMetaNameTable[] =
1947
{
1948
    { OOO_STRING_SVTOOLS_HTML_META_author,        HtmlMeta::Author        },
1949
    { OOO_STRING_SVTOOLS_HTML_META_changed,       HtmlMeta::Changed       },
1950
    { OOO_STRING_SVTOOLS_HTML_META_changedby,     HtmlMeta::ChangedBy     },
1951
    { OOO_STRING_SVTOOLS_HTML_META_classification,HtmlMeta::Classification},
1952
    { OOO_STRING_SVTOOLS_HTML_META_content_type,  HtmlMeta::ContentType   },
1953
    { OOO_STRING_SVTOOLS_HTML_META_created,       HtmlMeta::Created       },
1954
    { OOO_STRING_SVTOOLS_HTML_META_description,   HtmlMeta::Description   },
1955
    { OOO_STRING_SVTOOLS_HTML_META_keywords,      HtmlMeta::Keywords      },
1956
    { OOO_STRING_SVTOOLS_HTML_META_generator,     HtmlMeta::Generator     },
1957
    { OOO_STRING_SVTOOLS_HTML_META_refresh,       HtmlMeta::Refresh       },
1958
    { OOO_STRING_SVTOOLS_HTML_META_sdendnote,     HtmlMeta::SDEndnote     },
1959
    { OOO_STRING_SVTOOLS_HTML_META_sdfootnote,    HtmlMeta::SDFootnote    },
1960
    { nullptr,                                    HtmlMeta(0)             }
1961
};
1962
1963
1964
void HTMLParser::AddMetaUserDefined( OUString const & )
1965
0
{
1966
0
}
1967
1968
bool HTMLParser::ParseMetaOptionsImpl(
1969
        const uno::Reference<document::XDocumentProperties> & i_xDocProps,
1970
        SvKeyValueIterator *i_pHTTPHeader,
1971
        const HTMLOptions& aOptions,
1972
        rtl_TextEncoding& o_rEnc )
1973
63.1k
{
1974
63.1k
    OUString aName, aContent;
1975
63.1k
    HtmlMeta nAction = HtmlMeta::NONE;
1976
63.1k
    bool bHTTPEquiv = false, bChanged = false;
1977
1978
171k
    for ( size_t i = aOptions.size(); i; )
1979
108k
    {
1980
108k
        const HTMLOption& aOption = aOptions[--i];
1981
108k
        switch ( aOption.GetToken() )
1982
108k
        {
1983
37.5k
            case HtmlOptionId::NAME:
1984
37.5k
                aName = aOption.GetString();
1985
37.5k
                if ( HtmlMeta::NONE==nAction )
1986
37.3k
                {
1987
37.3k
                    aOption.GetEnum( nAction, aHTMLMetaNameTable );
1988
37.3k
                }
1989
37.5k
                break;
1990
5.92k
            case HtmlOptionId::HTTPEQUIV:
1991
5.92k
                aName = aOption.GetString();
1992
5.92k
                aOption.GetEnum( nAction, aHTMLMetaNameTable );
1993
5.92k
                bHTTPEquiv = true;
1994
5.92k
                break;
1995
35.2k
            case HtmlOptionId::CONTENT:
1996
35.2k
                aContent = aOption.GetString();
1997
35.2k
                break;
1998
1.05k
            case HtmlOptionId::CHARSET:
1999
1.05k
            {
2000
1.05k
                OString sValue(OUStringToOString(aOption.GetString(), RTL_TEXTENCODING_ASCII_US));
2001
1.05k
                o_rEnc = GetExtendedCompatibilityTextEncoding(rtl_getTextEncodingFromMimeCharset(sValue.getStr()));
2002
1.05k
                break;
2003
0
            }
2004
28.5k
            default: break;
2005
108k
        }
2006
108k
    }
2007
2008
63.1k
    if ( bHTTPEquiv || HtmlMeta::Description != nAction )
2009
63.0k
    {
2010
        // if it is not a Description, remove CRs and LFs from CONTENT
2011
63.0k
        aContent = aContent.replaceAll("\r", "").replaceAll("\n", "");
2012
63.0k
    }
2013
178
    else
2014
178
    {
2015
        // convert line endings for Description
2016
178
        aContent = convertLineEnd(aContent, GetSystemLineEnd());
2017
178
    }
2018
2019
63.1k
    if ( bHTTPEquiv && i_pHTTPHeader )
2020
5.86k
    {
2021
        // Netscape seems to just ignore a closing ", so we do too
2022
5.86k
        if ( aContent.endsWith("\"") )
2023
70
        {
2024
70
            aContent = aContent.copy( 0, aContent.getLength() - 1 );
2025
70
        }
2026
5.86k
        SvKeyValue aKeyValue( aName, aContent );
2027
5.86k
        i_pHTTPHeader->Append( aKeyValue );
2028
5.86k
    }
2029
2030
63.1k
    switch ( nAction )
2031
63.1k
    {
2032
1.47k
        case HtmlMeta::Author:
2033
1.47k
            if (i_xDocProps.is()) {
2034
1.47k
                i_xDocProps->setAuthor( aContent );
2035
1.47k
                bChanged = true;
2036
1.47k
            }
2037
1.47k
            break;
2038
178
        case HtmlMeta::Description:
2039
178
            if (i_xDocProps.is()) {
2040
178
                i_xDocProps->setDescription( aContent );
2041
178
                bChanged = true;
2042
178
            }
2043
178
            break;
2044
3.23k
        case HtmlMeta::Keywords:
2045
3.23k
            if (i_xDocProps.is()) {
2046
3.23k
                i_xDocProps->setKeywords(
2047
3.23k
                    ::comphelper::string::convertCommaSeparated(aContent));
2048
3.23k
                bChanged = true;
2049
3.23k
            }
2050
3.23k
            break;
2051
14
        case HtmlMeta::Classification:
2052
14
            if (i_xDocProps.is()) {
2053
14
                i_xDocProps->setSubject( aContent );
2054
14
                bChanged = true;
2055
14
            }
2056
14
            break;
2057
2058
1.89k
        case HtmlMeta::ChangedBy:
2059
1.89k
            if (i_xDocProps.is()) {
2060
1.89k
                i_xDocProps->setModifiedBy( aContent );
2061
1.89k
                bChanged = true;
2062
1.89k
            }
2063
1.89k
            break;
2064
2065
4.06k
        case HtmlMeta::Created:
2066
15.0k
        case HtmlMeta::Changed:
2067
15.0k
            if (i_xDocProps.is() && !aContent.isEmpty())
2068
14.1k
            {
2069
14.1k
                ::util::DateTime uDT;
2070
14.1k
                bool valid = false;
2071
14.1k
                if (comphelper::string::getTokenCount(aContent, ';') == 2)
2072
4.76k
                {
2073
4.76k
                    sal_Int32 nIdx{ 0 };
2074
4.76k
                    sal_Int32 nDate = o3tl::toInt32(o3tl::getToken(aContent, 0, ';', nIdx));
2075
4.76k
                    sal_Int64 nTime = o3tl::toInt64(o3tl::getToken(aContent, 0, ';', nIdx));
2076
4.76k
                    valid = nDate != std::numeric_limits<sal_Int32>::min() &&
2077
4.76k
                            nTime != std::numeric_limits<sal_Int64>::min();
2078
4.76k
                    if (valid)
2079
4.76k
                    {
2080
4.76k
                        Date aDate(nDate);
2081
4.76k
                        tools::Time aTime(tools::Time::fromEncodedTime(nTime));
2082
4.76k
                        uDT = DateTime(aDate, aTime).GetUNODateTime();
2083
4.76k
                    }
2084
4.76k
                }
2085
9.41k
                else if (utl::ISO8601parseDateTime(aContent, uDT))
2086
4.65k
                    valid = true;
2087
2088
14.1k
                if (valid)
2089
9.41k
                {
2090
9.41k
                    bChanged = true;
2091
9.41k
                    if (HtmlMeta::Created == nAction)
2092
1.37k
                        i_xDocProps->setCreationDate(uDT);
2093
8.03k
                    else
2094
8.03k
                        i_xDocProps->setModificationDate(uDT);
2095
9.41k
                }
2096
14.1k
            }
2097
15.0k
            break;
2098
2099
2
        case HtmlMeta::Refresh:
2100
2
            DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader, "Lost Reload-URL because of omitted MUST change." );
2101
2
            break;
2102
2103
4.67k
        case HtmlMeta::ContentType:
2104
4.67k
            if ( !aContent.isEmpty() )
2105
4.32k
            {
2106
4.32k
                o_rEnc = GetEncodingByMIME( aContent );
2107
4.32k
            }
2108
4.67k
            break;
2109
2110
32.5k
        case HtmlMeta::NONE:
2111
32.5k
            if ( !bHTTPEquiv )
2112
31.3k
            {
2113
31.3k
                if (i_xDocProps.is())
2114
31.3k
                {
2115
31.3k
                    uno::Reference<beans::XPropertyContainer> xUDProps
2116
31.3k
                        = i_xDocProps->getUserDefinedProperties();
2117
31.3k
                    try {
2118
31.3k
                        xUDProps->addProperty(aName,
2119
31.3k
                            beans::PropertyAttribute::REMOVABLE,
2120
31.3k
                            uno::Any(aContent));
2121
31.3k
                        AddMetaUserDefined(aName);
2122
31.3k
                        bChanged = true;
2123
31.3k
                    } catch (uno::Exception &) {
2124
                        // ignore
2125
23.4k
                    }
2126
31.3k
                }
2127
31.3k
            }
2128
32.5k
            break;
2129
32.5k
        default:
2130
4.12k
            break;
2131
63.1k
    }
2132
2133
63.1k
    return bChanged;
2134
63.1k
}
2135
2136
bool HTMLParser::ParseMetaOptions(
2137
        const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2138
        SvKeyValueIterator *i_pHeader )
2139
63.1k
{
2140
63.1k
    HtmlOptionId nContentOption = HtmlOptionId::CONTENT;
2141
63.1k
    rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
2142
2143
63.1k
    bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader,
2144
63.1k
                      GetOptions(&nContentOption),
2145
63.1k
                      eEnc );
2146
2147
    // If the encoding is set by a META tag, it may only overwrite the
2148
    // current encoding if both, the current and the new encoding, are 1-sal_uInt8
2149
    // encodings. Everything else cannot lead to reasonable results.
2150
63.1k
    if (RTL_TEXTENCODING_DONTKNOW != eEnc &&
2151
3.66k
        rtl_isOctetTextEncoding( eEnc ) &&
2152
3.66k
        rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2153
3.19k
    {
2154
3.19k
        eEnc = GetExtendedCompatibilityTextEncoding( eEnc );
2155
3.19k
        SetSrcEncoding( eEnc );
2156
3.19k
    }
2157
2158
63.1k
    return bRet;
2159
63.1k
}
2160
2161
rtl_TextEncoding HTMLParser::GetEncodingByMIME( const OUString& rMime )
2162
42.3k
{
2163
42.3k
    OUString sType;
2164
42.3k
    OUString sSubType;
2165
42.3k
    INetContentTypeParameterList aParameters;
2166
42.3k
    if (INetContentTypes::parse(rMime, sType, sSubType, &aParameters))
2167
34.9k
    {
2168
34.9k
        auto const iter = aParameters.find("charset"_ostr);
2169
34.9k
        if (iter != aParameters.end())
2170
33.8k
        {
2171
33.8k
            const INetContentTypeParameter * pCharset = &iter->second;
2172
33.8k
            OString sValue(OUStringToOString(pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US));
2173
33.8k
            return GetExtendedCompatibilityTextEncoding( rtl_getTextEncodingFromMimeCharset( sValue.getStr() ) );
2174
33.8k
        }
2175
34.9k
    }
2176
8.51k
    return RTL_TEXTENCODING_DONTKNOW;
2177
42.3k
}
2178
2179
rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader )
2180
42.7k
{
2181
42.7k
    rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
2182
42.7k
    if( pHTTPHeader )
2183
42.7k
    {
2184
42.7k
        SvKeyValue aKV;
2185
71.1k
        for( bool bCont = pHTTPHeader->GetFirst( aKV ); bCont;
2186
42.7k
             bCont = pHTTPHeader->GetNext( aKV ) )
2187
28.3k
        {
2188
28.3k
            if( aKV.GetKey().equalsIgnoreAsciiCase( OOO_STRING_SVTOOLS_HTML_META_content_type ) )
2189
28.3k
            {
2190
28.3k
                if( !aKV.GetValue().isEmpty() )
2191
28.3k
                {
2192
28.3k
                    eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() );
2193
28.3k
                }
2194
28.3k
            }
2195
28.3k
        }
2196
42.7k
    }
2197
42.7k
    return eRet;
2198
42.7k
}
2199
2200
bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader )
2201
42.7k
{
2202
42.7k
    bool bRet = false;
2203
42.7k
    rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
2204
42.7k
    if(RTL_TEXTENCODING_DONTKNOW != eEnc)
2205
28.3k
    {
2206
28.3k
        SetSrcEncoding( eEnc );
2207
28.3k
        bRet = true;
2208
28.3k
    }
2209
42.7k
    return bRet;
2210
42.7k
}
2211
2212
2213
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */