/src/libreoffice/svtools/source/svhtml/parhtml.cxx
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /* |
3 | | * This file is part of the LibreOffice project. |
4 | | * |
5 | | * This Source Code Form is subject to the terms of the Mozilla Public |
6 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
7 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8 | | * |
9 | | * This file incorporates work covered by the following license notice: |
10 | | * |
11 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
12 | | * contributor license agreements. See the NOTICE file distributed |
13 | | * with this work for additional information regarding copyright |
14 | | * ownership. The ASF licenses this file to you under the Apache |
15 | | * License, Version 2.0 (the "License"); you may not use this file |
16 | | * except in compliance with the License. You may obtain a copy of |
17 | | * the License at http://www.apache.org/licenses/LICENSE-2.0 . |
18 | | */ |
19 | | |
20 | | #include <comphelper/string.hxx> |
21 | | #include <o3tl/safeint.hxx> |
22 | | #include <o3tl/string_view.hxx> |
23 | | #include <tools/stream.hxx> |
24 | | #include <tools/debug.hxx> |
25 | | #include <tools/color.hxx> |
26 | | #include <rtl/ustrbuf.hxx> |
27 | | #include <rtl/character.hxx> |
28 | | #include <rtl/tencinfo.h> |
29 | | #include <sal/log.hxx> |
30 | | #include <tools/tenccvt.hxx> |
31 | | #include <tools/datetime.hxx> |
32 | | #include <unotools/datetime.hxx> |
33 | | #include <svl/inettype.hxx> |
34 | | #include <svl/lngmisc.hxx> |
35 | | #include <com/sun/star/beans/PropertyAttribute.hpp> |
36 | | #include <com/sun/star/document/XDocumentProperties.hpp> |
37 | | |
38 | | #include <svtools/parhtml.hxx> |
39 | | #include <svtools/htmltokn.h> |
40 | | #include <svtools/htmlkywd.hxx> |
41 | | |
42 | | #include <utility> |
43 | | |
44 | | using namespace ::com::sun::star; |
45 | | |
46 | | |
47 | | const sal_Int32 MAX_LEN( 1024 ); |
48 | | |
49 | | const sal_Int32 MAX_ENTITY_LEN( 8 ); |
50 | | |
51 | | |
52 | | // Tables to convert option values into strings |
53 | | |
54 | | // <INPUT TYPE=xxx> |
55 | | HTMLOptionEnum<HTMLInputType> const aInputTypeOptEnums[] = |
56 | | { |
57 | | { OOO_STRING_SVTOOLS_HTML_IT_text, HTMLInputType::Text }, |
58 | | { OOO_STRING_SVTOOLS_HTML_IT_password, HTMLInputType::Password }, |
59 | | { OOO_STRING_SVTOOLS_HTML_IT_checkbox, HTMLInputType::Checkbox }, |
60 | | { OOO_STRING_SVTOOLS_HTML_IT_radio, HTMLInputType::Radio }, |
61 | | { OOO_STRING_SVTOOLS_HTML_IT_range, HTMLInputType::Range }, |
62 | | { OOO_STRING_SVTOOLS_HTML_IT_scribble, HTMLInputType::Scribble }, |
63 | | { OOO_STRING_SVTOOLS_HTML_IT_file, HTMLInputType::File }, |
64 | | { OOO_STRING_SVTOOLS_HTML_IT_hidden, HTMLInputType::Hidden }, |
65 | | { OOO_STRING_SVTOOLS_HTML_IT_submit, HTMLInputType::Submit }, |
66 | | { OOO_STRING_SVTOOLS_HTML_IT_image, HTMLInputType::Image }, |
67 | | { OOO_STRING_SVTOOLS_HTML_IT_reset, HTMLInputType::Reset }, |
68 | | { OOO_STRING_SVTOOLS_HTML_IT_button, HTMLInputType::Button }, |
69 | | { nullptr, HTMLInputType(0) } |
70 | | }; |
71 | | |
72 | | // <TABLE FRAME=xxx> |
73 | | HTMLOptionEnum<HTMLTableFrame> const aTableFrameOptEnums[] = |
74 | | { |
75 | | { OOO_STRING_SVTOOLS_HTML_TF_void, HTMLTableFrame::Void }, |
76 | | { OOO_STRING_SVTOOLS_HTML_TF_above, HTMLTableFrame::Above }, |
77 | | { OOO_STRING_SVTOOLS_HTML_TF_below, HTMLTableFrame::Below }, |
78 | | { OOO_STRING_SVTOOLS_HTML_TF_hsides, HTMLTableFrame::HSides }, |
79 | | { OOO_STRING_SVTOOLS_HTML_TF_lhs, HTMLTableFrame::LHS }, |
80 | | { OOO_STRING_SVTOOLS_HTML_TF_rhs, HTMLTableFrame::RHS }, |
81 | | { OOO_STRING_SVTOOLS_HTML_TF_vsides, HTMLTableFrame::VSides }, |
82 | | { OOO_STRING_SVTOOLS_HTML_TF_box, HTMLTableFrame::Box }, |
83 | | { OOO_STRING_SVTOOLS_HTML_TF_border, HTMLTableFrame::Box }, |
84 | | { nullptr, HTMLTableFrame(0) } |
85 | | }; |
86 | | |
87 | | // <TABLE RULES=xxx> |
88 | | HTMLOptionEnum<HTMLTableRules> const aTableRulesOptEnums[] = |
89 | | { |
90 | | { OOO_STRING_SVTOOLS_HTML_TR_none, HTMLTableRules::NONE }, |
91 | | { OOO_STRING_SVTOOLS_HTML_TR_groups, HTMLTableRules::Groups }, |
92 | | { OOO_STRING_SVTOOLS_HTML_TR_rows, HTMLTableRules::Rows }, |
93 | | { OOO_STRING_SVTOOLS_HTML_TR_cols, HTMLTableRules::Cols }, |
94 | | { OOO_STRING_SVTOOLS_HTML_TR_all, HTMLTableRules::All }, |
95 | | { nullptr, HTMLTableRules(0) } |
96 | | }; |
97 | | |
98 | | |
99 | | HTMLOption::HTMLOption( HtmlOptionId nTok, OUString _aToken, |
100 | | OUString _aValue ) |
101 | 1.41M | : aValue(std::move(_aValue)) |
102 | 1.41M | , aToken(std::move(_aToken)) |
103 | 1.41M | , nToken( nTok ) |
104 | 1.41M | { |
105 | 1.41M | DBG_ASSERT( nToken>=HtmlOptionId::BOOL_START && nToken<HtmlOptionId::END, |
106 | 1.41M | "HTMLOption: unknown token" ); |
107 | 1.41M | } |
108 | | |
109 | | sal_uInt32 HTMLOption::GetNumber() const |
110 | 94.4k | { |
111 | 94.4k | DBG_ASSERT( (nToken>=HtmlOptionId::NUMBER_START && |
112 | 94.4k | nToken<HtmlOptionId::NUMBER_END) || |
113 | 94.4k | (nToken>=HtmlOptionId::CONTEXT_START && |
114 | 94.4k | nToken<HtmlOptionId::CONTEXT_END) || |
115 | 94.4k | nToken==HtmlOptionId::VALUE, |
116 | 94.4k | "GetNumber: Option not numerical" ); |
117 | 94.4k | OUString aTmp(comphelper::string::stripStart(aValue, ' ')); |
118 | 94.4k | sal_Int32 nTmp = aTmp.toInt32(); |
119 | 94.4k | return nTmp >= 0 ? static_cast<sal_uInt32>(nTmp) : 0; |
120 | 94.4k | } |
121 | | |
122 | | sal_Int32 HTMLOption::GetSNumber() const |
123 | 1.43k | { |
124 | 1.43k | DBG_ASSERT( (nToken>=HtmlOptionId::NUMBER_START && nToken<HtmlOptionId::NUMBER_END) || |
125 | 1.43k | (nToken>=HtmlOptionId::CONTEXT_START && nToken<HtmlOptionId::CONTEXT_END), |
126 | 1.43k | "GetSNumber: Option not numerical" ); |
127 | 1.43k | OUString aTmp(comphelper::string::stripStart(aValue, ' ')); |
128 | 1.43k | return aTmp.toInt32(); |
129 | 1.43k | } |
130 | | |
131 | | void HTMLOption::GetNumbers( std::vector<sal_uInt32> &rNumbers ) const |
132 | 2.05k | { |
133 | 2.05k | rNumbers.clear(); |
134 | | |
135 | | // This is a very simplified scanner: it only searches all |
136 | | // numerals in the string. |
137 | 2.05k | bool bInNum = false; |
138 | 2.05k | sal_uInt32 nNum = 0; |
139 | 30.2k | for( sal_Int32 i=0; i<aValue.getLength(); i++ ) |
140 | 28.1k | { |
141 | 28.1k | sal_Unicode c = aValue[ i ]; |
142 | 28.1k | if( c>='0' && c<='9' ) |
143 | 16.1k | { |
144 | 16.1k | nNum *= 10; |
145 | 16.1k | nNum += (c - '0'); |
146 | 16.1k | bInNum = true; |
147 | 16.1k | } |
148 | 12.0k | else if( bInNum ) |
149 | 7.28k | { |
150 | 7.28k | rNumbers.push_back( nNum ); |
151 | 7.28k | bInNum = false; |
152 | 7.28k | nNum = 0; |
153 | 7.28k | } |
154 | 28.1k | } |
155 | 2.05k | if( bInNum ) |
156 | 1.82k | { |
157 | 1.82k | rNumbers.push_back( nNum ); |
158 | 1.82k | } |
159 | 2.05k | } |
160 | | |
161 | | void HTMLOption::GetColor( Color& rColor ) const |
162 | 18.1k | { |
163 | 18.1k | DBG_ASSERT( (nToken>=HtmlOptionId::COLOR_START && nToken<HtmlOptionId::COLOR_END) || nToken==HtmlOptionId::SIZE, |
164 | 18.1k | "GetColor: Option is not a color." ); |
165 | | |
166 | 18.1k | OUString aTmp(aValue.toAsciiLowerCase()); |
167 | 18.1k | sal_uInt32 nColor = SAL_MAX_UINT32; |
168 | 18.1k | if (!aTmp.isEmpty() && aTmp[0] != '#') |
169 | 5.82k | nColor = GetHTMLColor(aTmp); |
170 | | |
171 | 18.1k | if( SAL_MAX_UINT32 == nColor ) |
172 | 15.5k | { |
173 | 15.5k | nColor = 0; |
174 | 15.5k | sal_Int32 nPos = 0; |
175 | 108k | for (sal_uInt32 i=0; i<6; ++i) |
176 | 93.0k | { |
177 | | // Whatever Netscape does to get color values, |
178 | | // at maximum three characters < '0' are ignored. |
179 | 93.0k | sal_Unicode c = nPos<aTmp.getLength() ? aTmp[ nPos++ ] : '0'; |
180 | 93.0k | if( c < '0' ) |
181 | 13.6k | { |
182 | 13.6k | c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0'; |
183 | 13.6k | if( c < '0' ) |
184 | 1.27k | c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0'; |
185 | 13.6k | } |
186 | 93.0k | nColor *= 16; |
187 | 93.0k | if( c >= '0' && c <= '9' ) |
188 | 47.0k | nColor += (c - '0'); |
189 | 46.0k | else if( c >= 'a' && c <= 'f' ) |
190 | 36.0k | nColor += (c + 0xa - 'a'); |
191 | 93.0k | } |
192 | 15.5k | } |
193 | | |
194 | 18.1k | rColor.SetRed( static_cast<sal_uInt8>((nColor & 0x00ff0000) >> 16) ); |
195 | 18.1k | rColor.SetGreen( static_cast<sal_uInt8>((nColor & 0x0000ff00) >> 8)); |
196 | 18.1k | rColor.SetBlue( static_cast<sal_uInt8>(nColor & 0x000000ff) ); |
197 | 18.1k | } |
198 | | |
199 | | HTMLInputType HTMLOption::GetInputType() const |
200 | 0 | { |
201 | 0 | DBG_ASSERT( nToken==HtmlOptionId::TYPE, "GetInputType: Option not TYPE" ); |
202 | 0 | return GetEnum( aInputTypeOptEnums, HTMLInputType::Text ); |
203 | 0 | } |
204 | | |
205 | | HTMLTableFrame HTMLOption::GetTableFrame() const |
206 | 3.88k | { |
207 | 3.88k | DBG_ASSERT( nToken==HtmlOptionId::FRAME, "GetTableFrame: Option not FRAME" ); |
208 | 3.88k | return GetEnum( aTableFrameOptEnums ); |
209 | 3.88k | } |
210 | | |
211 | | HTMLTableRules HTMLOption::GetTableRules() const |
212 | 766 | { |
213 | 766 | DBG_ASSERT( nToken==HtmlOptionId::RULES, "GetTableRules: Option not RULES" ); |
214 | 766 | return GetEnum( aTableRulesOptEnums ); |
215 | 766 | } |
216 | | |
217 | | HTMLParser::HTMLParser( SvStream& rIn, bool bReadNewDoc ) : |
218 | 42.7k | SvParser<HtmlTokenId>( rIn ), |
219 | 42.7k | bNewDoc(bReadNewDoc), |
220 | 42.7k | bIsInHeader(true), |
221 | 42.7k | bReadListing(false), |
222 | 42.7k | bReadXMP(false), |
223 | 42.7k | bReadPRE(false), |
224 | 42.7k | bReadTextArea(false), |
225 | 42.7k | bReadScript(false), |
226 | 42.7k | bReadStyle(false), |
227 | 42.7k | bEndTokenFound(false), |
228 | 42.7k | bPre_IgnoreNewPara(false), |
229 | 42.7k | bReadNextChar(false), |
230 | 42.7k | bReadComment(false), |
231 | 42.7k | nPre_LinePos(0), |
232 | 42.7k | mnPendingOffToken(HtmlTokenId::NONE) |
233 | 42.7k | { |
234 | | //#i76649, default to UTF-8 for HTML unless we know differently |
235 | 42.7k | SetSrcEncoding(RTL_TEXTENCODING_UTF8); |
236 | 42.7k | } |
237 | | |
238 | | HTMLParser::~HTMLParser() |
239 | 42.7k | { |
240 | 42.7k | } |
241 | | |
242 | | void HTMLParser::SetNamespace(std::u16string_view rNamespace) |
243 | 0 | { |
244 | | // Convert namespace alias to a prefix. |
245 | 0 | maNamespace = OUString::Concat(rNamespace) + ":"; |
246 | 0 | } |
247 | | |
248 | | namespace |
249 | | { |
250 | | class RefGuard |
251 | | { |
252 | | private: |
253 | | HTMLParser& m_rParser; |
254 | | public: |
255 | | RefGuard(HTMLParser& rParser) |
256 | 42.7k | : m_rParser(rParser) |
257 | 42.7k | { |
258 | 42.7k | m_rParser.AddFirstRef(); |
259 | 42.7k | } |
260 | | |
261 | | ~RefGuard() |
262 | 42.7k | { |
263 | 42.7k | if (m_rParser.GetStatus() != SvParserState::Pending) |
264 | 42.7k | m_rParser.ReleaseRef(); // Parser not needed anymore |
265 | 42.7k | } |
266 | | }; |
267 | | } |
268 | | |
269 | | SvParserState HTMLParser::CallParser() |
270 | 42.7k | { |
271 | 42.7k | eState = SvParserState::Working; |
272 | 42.7k | nNextCh = GetNextChar(); |
273 | 42.7k | SaveState( HtmlTokenId::NONE ); |
274 | | |
275 | 42.7k | nPre_LinePos = 0; |
276 | 42.7k | bPre_IgnoreNewPara = false; |
277 | | |
278 | 42.7k | RefGuard aRefGuard(*this); |
279 | | |
280 | 42.7k | Continue( HtmlTokenId::NONE ); |
281 | | |
282 | 42.7k | return eState; |
283 | 42.7k | } |
284 | | |
285 | | void HTMLParser::Continue( HtmlTokenId nToken ) |
286 | 42.7k | { |
287 | 42.7k | if( nToken == HtmlTokenId::NONE ) |
288 | 42.7k | nToken = GetNextToken(); |
289 | | |
290 | 4.83M | while( IsParserWorking() ) |
291 | 4.79M | { |
292 | 4.79M | SaveState( nToken ); |
293 | 4.79M | nToken = FilterToken( nToken ); |
294 | | |
295 | 4.79M | if( nToken != HtmlTokenId::NONE ) |
296 | 4.79M | NextToken( nToken ); |
297 | | |
298 | 4.79M | if( IsParserWorking() ) |
299 | 4.78M | SaveState( HtmlTokenId::NONE ); // continue with new token |
300 | | |
301 | 4.79M | nToken = GetNextToken(); |
302 | 4.79M | } |
303 | 42.7k | } |
304 | | |
305 | | HtmlTokenId HTMLParser::FilterToken( HtmlTokenId nToken ) |
306 | 7.19M | { |
307 | 7.19M | switch( nToken ) |
308 | 7.19M | { |
309 | 0 | case HtmlTokenId(EOF): |
310 | 0 | nToken = HtmlTokenId::NONE; |
311 | 0 | break; // don't pass |
312 | | |
313 | 13.8k | case HtmlTokenId::HEAD_OFF: |
314 | 13.8k | bIsInHeader = false; |
315 | 13.8k | break; |
316 | | |
317 | 14.4k | case HtmlTokenId::HEAD_ON: |
318 | 14.4k | bIsInHeader = true; |
319 | 14.4k | break; |
320 | | |
321 | 17.2k | case HtmlTokenId::BODY_ON: |
322 | 17.2k | bIsInHeader = false; |
323 | 17.2k | break; |
324 | | |
325 | 804 | case HtmlTokenId::FRAMESET_ON: |
326 | 804 | bIsInHeader = false; |
327 | 804 | break; |
328 | | |
329 | 6.17k | case HtmlTokenId::BODY_OFF: |
330 | 6.17k | bReadPRE = bReadListing = bReadXMP = false; |
331 | 6.17k | break; |
332 | | |
333 | 3.91k | case HtmlTokenId::HTML_OFF: |
334 | 3.91k | nToken = HtmlTokenId::NONE; |
335 | 3.91k | bReadPRE = bReadListing = bReadXMP = false; |
336 | 3.91k | break; // HtmlTokenId::ON hasn't been passed either ! |
337 | | |
338 | 34.6k | case HtmlTokenId::PREFORMTXT_ON: |
339 | 34.6k | StartPRE(); |
340 | 34.6k | break; |
341 | | |
342 | 2.03k | case HtmlTokenId::PREFORMTXT_OFF: |
343 | 2.03k | FinishPRE(); |
344 | 2.03k | break; |
345 | | |
346 | 938 | case HtmlTokenId::LISTING_ON: |
347 | 938 | StartListing(); |
348 | 938 | break; |
349 | | |
350 | 335 | case HtmlTokenId::LISTING_OFF: |
351 | 335 | FinishListing(); |
352 | 335 | break; |
353 | | |
354 | 8.51k | case HtmlTokenId::XMP_ON: |
355 | 8.51k | StartXMP(); |
356 | 8.51k | break; |
357 | | |
358 | 314 | case HtmlTokenId::XMP_OFF: |
359 | 314 | FinishXMP(); |
360 | 314 | break; |
361 | | |
362 | 7.08M | default: |
363 | 7.08M | if( bReadPRE ) |
364 | 1.00M | nToken = FilterPRE( nToken ); |
365 | 6.08M | else if( bReadListing ) |
366 | 20.3k | nToken = FilterListing( nToken ); |
367 | 6.06M | else if( bReadXMP ) |
368 | 130k | nToken = FilterXMP( nToken ); |
369 | | |
370 | 7.08M | break; |
371 | 7.19M | } |
372 | | |
373 | 7.19M | return nToken; |
374 | 7.19M | } |
375 | | |
376 | | namespace { |
377 | | |
378 | 12.1M | constexpr bool HTML_ISPRINTABLE(sal_Unicode c) { return c >= 32 && c != 127; } |
379 | | |
380 | | constexpr bool HTML_ISSPACE(sal_uInt32 c) |
381 | 4.14M | { |
382 | 4.14M | return ' ' == c || '\t' == c || '\r' == c || '\n' == c || '\x0b' == c; |
383 | 4.14M | } |
384 | | |
385 | | } |
386 | | |
387 | | HtmlTokenId HTMLParser::ScanText(const sal_Unicode cBreak) |
388 | 3.61M | { |
389 | 3.61M | OUStringBuffer sTmpBuffer( MAX_LEN ); |
390 | 3.61M | bool bContinue = true; |
391 | 3.61M | bool bEqSignFound = false; |
392 | 3.61M | sal_uInt32 cQuote = 0U; |
393 | | |
394 | 33.6M | while( bContinue && IsParserWorking() ) |
395 | 30.0M | { |
396 | 30.0M | bool bNextCh = true; |
397 | 30.0M | switch( nNextCh ) |
398 | 30.0M | { |
399 | 180k | case '&': |
400 | 180k | bEqSignFound = false; |
401 | 180k | if( bReadXMP ) |
402 | 4.13k | sTmpBuffer.append( '&' ); |
403 | 176k | else |
404 | 176k | { |
405 | 176k | sal_uInt64 nStreamPos = rInput.Tell(); |
406 | 176k | sal_uInt32 nLinePos = GetLinePos(); |
407 | | |
408 | 176k | sal_uInt32 cChar = 0U; |
409 | 176k | if( '#' == (nNextCh = GetNextChar()) ) |
410 | 45.2k | { |
411 | 45.2k | nNextCh = GetNextChar(); |
412 | 45.2k | const bool bIsHex( 'x' == nNextCh ); |
413 | 45.2k | const bool bIsDecOrHex( bIsHex || rtl::isAsciiDigit(nNextCh) ); |
414 | 45.2k | if ( bIsDecOrHex ) |
415 | 40.3k | { |
416 | 40.3k | if ( bIsHex ) |
417 | 9.73k | { |
418 | 9.73k | nNextCh = GetNextChar(); |
419 | 32.2k | while ( rtl::isAsciiHexDigit(nNextCh) ) |
420 | 22.4k | { |
421 | 22.4k | cChar = cChar * 16U + |
422 | 22.4k | ( nNextCh <= '9' |
423 | 22.4k | ? sal_uInt32( nNextCh - '0' ) |
424 | 22.4k | : ( nNextCh <= 'F' |
425 | 20.2k | ? sal_uInt32( nNextCh - 'A' + 10 ) |
426 | 20.2k | : sal_uInt32( nNextCh - 'a' + 10 ) ) ); |
427 | 22.4k | nNextCh = GetNextChar(); |
428 | 22.4k | } |
429 | 9.73k | } |
430 | 30.5k | else |
431 | 30.5k | { |
432 | 30.5k | do |
433 | 116k | { |
434 | 116k | cChar = cChar * 10U + sal_uInt32( nNextCh - '0'); |
435 | 116k | nNextCh = GetNextChar(); |
436 | 116k | } |
437 | 116k | while( rtl::isAsciiDigit(nNextCh) ); |
438 | 30.5k | } |
439 | | |
440 | 40.3k | if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc && |
441 | 38.2k | RTL_TEXTENCODING_UCS2 != eSrcEnc && |
442 | 38.1k | RTL_TEXTENCODING_UTF8 != eSrcEnc && |
443 | 11.0k | cChar < 256 ) |
444 | 6.06k | { |
445 | 6.06k | const sal_uInt32 convertFlags = |
446 | 6.06k | RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT | |
447 | 6.06k | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT | |
448 | 6.06k | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT; |
449 | | |
450 | 6.06k | char cEncodedChar = static_cast<char>(cChar); |
451 | 6.06k | cChar = OUString(&cEncodedChar, 1, eSrcEnc, convertFlags).toChar(); |
452 | 6.06k | if( 0U == cChar ) |
453 | 346 | { |
454 | | // If the character could not be |
455 | | // converted, because a conversion is not |
456 | | // available, do no conversion at all. |
457 | 346 | cChar = cEncodedChar; |
458 | 346 | } |
459 | 6.06k | } |
460 | 40.3k | } |
461 | 4.92k | else |
462 | 4.92k | nNextCh = 0U; |
463 | | |
464 | 45.2k | if (!rtl::isUnicodeCodePoint(cChar) |
465 | 44.1k | || (linguistic::IsControlChar(cChar) |
466 | 20.9k | && cChar != '\r' && cChar != '\n' && cChar != '\t')) |
467 | 9.19k | { |
468 | 9.19k | cChar = '?'; |
469 | 9.19k | } |
470 | 45.2k | } |
471 | 130k | else if( rtl::isAsciiAlpha( nNextCh ) ) |
472 | 96.7k | { |
473 | 96.7k | OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN ); |
474 | 96.7k | sal_Int32 nPos = 0; |
475 | 96.7k | do |
476 | 336k | { |
477 | 336k | sEntityBuffer.appendUtf32( nNextCh ); |
478 | 336k | nPos++; |
479 | 336k | nNextCh = GetNextChar(); |
480 | 336k | } |
481 | 336k | while( nPos < MAX_ENTITY_LEN && rtl::isAsciiAlphanumeric( nNextCh ) && |
482 | 239k | !rInput.eof() ); |
483 | | |
484 | 96.7k | if( IsParserWorking() && !rInput.eof() ) |
485 | 96.4k | { |
486 | 96.4k | std::u16string_view sEntity(sEntityBuffer.subView(0, nPos)); |
487 | 96.4k | cChar = GetHTMLCharName( sEntity ); |
488 | | |
489 | | // not found ( == 0 ): plain text |
490 | | // or a character which is inserted as attribute |
491 | 96.4k | if( 0U == cChar && ';' != nNextCh ) |
492 | 18.6k | { |
493 | 18.6k | DBG_ASSERT( rInput.Tell() - nStreamPos == |
494 | 18.6k | static_cast<sal_uInt64>(nPos+1)*GetCharSize(), |
495 | 18.6k | "UTF-8 is failing here" ); |
496 | 46.9k | for( sal_Int32 i = nPos-1; i>1; i-- ) |
497 | 29.6k | { |
498 | 29.6k | nNextCh = sEntityBuffer[i]; |
499 | 29.6k | sEntityBuffer.setLength( i ); |
500 | 29.6k | sEntity = sEntityBuffer.subView(0, i); |
501 | 29.6k | cChar = GetHTMLCharName( sEntity ); |
502 | 29.6k | if( cChar ) |
503 | 1.35k | { |
504 | 1.35k | rInput.SeekRel( -static_cast<sal_Int64> |
505 | 1.35k | (nPos-i)*GetCharSize() ); |
506 | 1.35k | nlLinePos -= sal_uInt32(nPos-i); |
507 | 1.35k | nPos = i; |
508 | 1.35k | ClearTxtConvContext(); |
509 | 1.35k | break; |
510 | 1.35k | } |
511 | 29.6k | } |
512 | 18.6k | } |
513 | | |
514 | 96.4k | if( !cChar ) // unknown character? |
515 | 25.6k | { |
516 | | // back in stream, insert '&' |
517 | | // and restart with next character |
518 | 25.6k | sTmpBuffer.append( '&' ); |
519 | | |
520 | 25.6k | DBG_ASSERT( rInput.Tell()-nStreamPos == |
521 | 25.6k | static_cast<sal_uInt64>(nPos+1)*GetCharSize(), |
522 | 25.6k | "Wrong stream position" ); |
523 | 25.6k | DBG_ASSERT( nlLinePos-nLinePos == |
524 | 25.6k | static_cast<sal_uInt32>(nPos+1), |
525 | 25.6k | "Wrong line position" ); |
526 | 25.6k | rInput.Seek( nStreamPos ); |
527 | 25.6k | nlLinePos = nLinePos; |
528 | 25.6k | ClearTxtConvContext(); |
529 | 25.6k | break; |
530 | 25.6k | } |
531 | | |
532 | 96.4k | assert(cChar != 0); |
533 | | |
534 | | // 1 == Non Breaking Space |
535 | | // 2 == SoftHyphen |
536 | | |
537 | 70.8k | if (cChar == 1 || cChar == 2) |
538 | 35.6k | { |
539 | 35.6k | if( '>' == cBreak ) |
540 | 2.05k | { |
541 | | // When reading the content of a tag we have |
542 | | // to change it to ' ' or '-' |
543 | 2.05k | if( 1U == cChar ) |
544 | 1.21k | cChar = ' '; |
545 | 843 | else //2U |
546 | 843 | cChar = '-'; |
547 | 2.05k | } |
548 | 33.6k | else |
549 | 33.6k | { |
550 | | // If not scanning a tag return token |
551 | 33.6k | aToken.append( sTmpBuffer ); |
552 | 33.6k | sTmpBuffer.setLength(0); |
553 | | |
554 | 33.6k | if( !aToken.isEmpty() ) |
555 | 9.32k | { |
556 | | // restart with character |
557 | 9.32k | nNextCh = '&'; |
558 | 9.32k | DBG_ASSERT( rInput.Tell()-nStreamPos == |
559 | 9.32k | static_cast<sal_uInt64>(nPos+1)*GetCharSize(), |
560 | 9.32k | "Wrong stream position" ); |
561 | 9.32k | DBG_ASSERT( nlLinePos-nLinePos == |
562 | 9.32k | static_cast<sal_uInt32>(nPos+1), |
563 | 9.32k | "Wrong line position" ); |
564 | 9.32k | rInput.Seek( nStreamPos ); |
565 | 9.32k | nlLinePos = nLinePos; |
566 | 9.32k | ClearTxtConvContext(); |
567 | 9.32k | return HtmlTokenId::TEXTTOKEN; |
568 | 9.32k | } |
569 | | |
570 | | // Hack: _GetNextChar shall not read the |
571 | | // next character |
572 | 24.2k | if( ';' != nNextCh ) |
573 | 915 | aToken.append( " " ); |
574 | 24.2k | if( 1U == cChar ) |
575 | 23.0k | return HtmlTokenId::NONBREAKSPACE; |
576 | 1.23k | else //2U |
577 | 1.23k | return HtmlTokenId::SOFTHYPH; |
578 | 24.2k | } |
579 | 35.6k | } |
580 | 70.8k | } |
581 | 267 | else |
582 | 267 | nNextCh = 0U; |
583 | 96.7k | } |
584 | | // &{...};-JavaScript-Macros are not supported any longer. |
585 | 34.0k | else if( IsParserWorking() ) |
586 | 34.0k | { |
587 | 34.0k | sTmpBuffer.append( '&' ); |
588 | 34.0k | bNextCh = false; |
589 | 34.0k | break; |
590 | 34.0k | } |
591 | | |
592 | 82.7k | bNextCh = (';' == nNextCh); |
593 | 82.7k | if( cBreak=='>' && (cChar=='\\' || cChar=='\'' || |
594 | 41.0k | cChar=='\"' || cChar==' ') ) |
595 | 14.6k | { |
596 | | // ' and " have to be escaped within tags to separate |
597 | | // them from ' and " enclosing options. |
598 | | // \ has to be escaped as well. |
599 | | // Space is protected because it's not a delimiter between |
600 | | // options. |
601 | 14.6k | sTmpBuffer.append( '\\' ); |
602 | 14.6k | } |
603 | 82.7k | if( IsParserWorking() ) |
604 | 82.7k | { |
605 | 82.7k | if( cChar ) |
606 | 82.4k | sTmpBuffer.appendUtf32( cChar ); |
607 | 82.7k | } |
608 | 0 | else if( SvParserState::Pending==eState && '>'!=cBreak ) |
609 | 0 | { |
610 | | // Restart with '&', the remainder is returned as |
611 | | // text token. |
612 | 0 | if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() ) |
613 | 0 | { |
614 | | // _GetNextChar() returns the previous text and |
615 | | // during the next execution a new character is read. |
616 | | // Thus we have to position in front of the '&'. |
617 | 0 | nNextCh = 0U; |
618 | 0 | rInput.Seek( nStreamPos - GetCharSize() ); |
619 | 0 | nlLinePos = nLinePos-1; |
620 | 0 | ClearTxtConvContext(); |
621 | 0 | bReadNextChar = true; |
622 | 0 | } |
623 | 0 | bNextCh = false; |
624 | 0 | } |
625 | 82.7k | } |
626 | 86.8k | break; |
627 | 1.76M | case '=': |
628 | 1.76M | if( '>'==cBreak && !cQuote ) |
629 | 1.25M | bEqSignFound = true; |
630 | 1.76M | sTmpBuffer.appendUtf32( nNextCh ); |
631 | 1.76M | break; |
632 | | |
633 | 105k | case '\\': |
634 | 105k | if( '>'==cBreak ) |
635 | 67.7k | { |
636 | | // mark within tags |
637 | 67.7k | sTmpBuffer.append( '\\' ); |
638 | 67.7k | } |
639 | 105k | sTmpBuffer.append( '\\' ); |
640 | 105k | break; |
641 | | |
642 | 2.19M | case '\"': |
643 | 2.41M | case '\'': |
644 | 2.41M | if( '>'==cBreak ) |
645 | 2.09M | { |
646 | 2.09M | if( bEqSignFound ) |
647 | 924k | cQuote = nNextCh; |
648 | 1.16M | else if( cQuote && (cQuote==nNextCh ) ) |
649 | 919k | cQuote = 0U; |
650 | 2.09M | } |
651 | 2.41M | sTmpBuffer.appendUtf32( nNextCh ); |
652 | 2.41M | bEqSignFound = false; |
653 | 2.41M | break; |
654 | | |
655 | 2.42M | case sal_Unicode(EOF): |
656 | 2.42M | if( rInput.eof() ) |
657 | 20.0k | { |
658 | 20.0k | bContinue = false; |
659 | 20.0k | } |
660 | | // else: ignore, not a valid code point |
661 | 2.42M | break; |
662 | | |
663 | 2.03M | case '<': |
664 | 2.03M | bEqSignFound = false; |
665 | 2.03M | if( '>'==cBreak ) |
666 | 758k | sTmpBuffer.appendUtf32( nNextCh ); |
667 | 1.27M | else |
668 | 1.27M | bContinue = false; // break, string is together |
669 | 2.03M | break; |
670 | | |
671 | 68.3k | case '\f': |
672 | 68.3k | if( '>' == cBreak ) |
673 | 63.0k | { |
674 | | // If scanning options treat it like a space, ... |
675 | 63.0k | sTmpBuffer.append( ' ' ); |
676 | 63.0k | } |
677 | 5.35k | else |
678 | 5.35k | { |
679 | | // otherwise it's a separate token. |
680 | 5.35k | bContinue = false; |
681 | 5.35k | } |
682 | 68.3k | break; |
683 | | |
684 | 245k | case '\r': |
685 | 885k | case '\n': |
686 | 885k | if( '>'==cBreak ) |
687 | 301k | { |
688 | | // cr/lf in tag is handled in GetNextToken_() |
689 | 301k | sTmpBuffer.appendUtf32( nNextCh ); |
690 | 301k | break; |
691 | 301k | } |
692 | 584k | else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea ) |
693 | 80.2k | { |
694 | 80.2k | bContinue = false; |
695 | 80.2k | break; |
696 | 80.2k | } |
697 | | // Reduce sequence of CR/LF/BLANK/TAB to a single blank |
698 | 503k | [[fallthrough]]; |
699 | 634k | case '\t': |
700 | 634k | if( '\t'==nNextCh && bReadPRE && '>'!=cBreak ) |
701 | 18.0k | { |
702 | | // Pass Tabs up in <PRE> |
703 | 18.0k | bContinue = false; |
704 | 18.0k | break; |
705 | 18.0k | } |
706 | 616k | [[fallthrough]]; |
707 | 639k | case '\x0b': |
708 | 639k | if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) && |
709 | 3.20k | '>'!=cBreak ) |
710 | 2.06k | { |
711 | 2.06k | break; |
712 | 2.06k | } |
713 | 637k | if (!m_bPreserveSpaces) |
714 | 632k | nNextCh = ' '; |
715 | 637k | [[fallthrough]]; |
716 | 2.70M | case ' ': |
717 | 2.70M | if (!m_bPreserveSpaces) |
718 | 2.69M | { |
719 | 2.69M | sTmpBuffer.appendUtf32(nNextCh); |
720 | 2.69M | if ('>' != cBreak && (!bReadListing && !bReadXMP && !bReadPRE && !bReadTextArea)) |
721 | 1.12M | { |
722 | | // Reduce sequences of Blanks/Tabs/CR/LF to a single blank |
723 | 1.12M | do |
724 | 2.30M | { |
725 | 2.30M | nNextCh = GetNextChar(); |
726 | 2.30M | if (sal_Unicode(EOF) == nNextCh && rInput.eof()) |
727 | 2.86k | { |
728 | 2.86k | if (!aToken.isEmpty() || sTmpBuffer.getLength() > 1) |
729 | 1.16k | { |
730 | | // Have seen s.th. aside from blanks? |
731 | 1.16k | aToken.append(sTmpBuffer); |
732 | 1.16k | sTmpBuffer.setLength(0); |
733 | 1.16k | return HtmlTokenId::TEXTTOKEN; |
734 | 1.16k | } |
735 | 1.70k | else |
736 | | // Only read blanks: no text must be returned |
737 | | // and GetNextToken_ has to read until EOF |
738 | 1.70k | return HtmlTokenId::NONE; |
739 | 2.86k | } |
740 | 2.30M | } while (HTML_ISSPACE(nNextCh)); |
741 | 1.12M | bNextCh = false; |
742 | 1.12M | } |
743 | 2.68M | break; |
744 | 2.69M | } |
745 | 14.2k | [[fallthrough]]; |
746 | 17.9M | default: |
747 | 17.9M | bEqSignFound = false; |
748 | 17.9M | if (nNextCh == cBreak && !cQuote) |
749 | 2.13M | bContinue = false; |
750 | 15.8M | else |
751 | 15.8M | { |
752 | 45.2M | do { |
753 | 45.2M | if (!linguistic::IsControlChar(nNextCh) || HTML_ISSPACE(nNextCh)) |
754 | 43.4M | { |
755 | | // All remaining characters make their way into the text. |
756 | 43.4M | sTmpBuffer.appendUtf32( nNextCh ); |
757 | 43.4M | } |
758 | | |
759 | 45.2M | nNextCh = GetNextChar(); |
760 | 45.2M | if( ( sal_Unicode(EOF) == nNextCh && rInput.eof() ) || |
761 | 45.2M | !IsParserWorking() ) |
762 | 38.9k | { |
763 | 38.9k | if( !sTmpBuffer.isEmpty() ) |
764 | 38.2k | aToken.append( sTmpBuffer ); |
765 | 38.9k | return HtmlTokenId::TEXTTOKEN; |
766 | 38.9k | } |
767 | 45.2M | } while( rtl::isAsciiAlpha( nNextCh ) || rtl::isAsciiDigit( nNextCh ) ); |
768 | 15.8M | bNextCh = false; |
769 | 15.8M | } |
770 | 30.0M | } |
771 | | |
772 | 29.9M | if( bContinue && bNextCh ) |
773 | 9.47M | nNextCh = GetNextChar(); |
774 | 29.9M | } |
775 | | |
776 | 3.54M | if( !sTmpBuffer.isEmpty() ) |
777 | 2.83M | aToken.append( sTmpBuffer ); |
778 | | |
779 | 3.54M | return HtmlTokenId::TEXTTOKEN; |
780 | 3.61M | } |
781 | | |
782 | | HtmlTokenId HTMLParser::GetNextRawToken() |
783 | 165k | { |
784 | 165k | OUStringBuffer sTmpBuffer( MAX_LEN ); |
785 | | |
786 | 165k | if( bEndTokenFound ) |
787 | 7.86k | { |
788 | | // During the last execution we already found the end token, |
789 | | // thus we don't have to search it again. |
790 | 7.86k | bReadScript = false; |
791 | 7.86k | bReadStyle = false; |
792 | 7.86k | aEndToken.clear(); |
793 | 7.86k | bEndTokenFound = false; |
794 | | |
795 | 7.86k | return HtmlTokenId::NONE; |
796 | 7.86k | } |
797 | | |
798 | | // Default return value: HtmlTokenId::RAWDATA |
799 | 157k | bool bContinue = true; |
800 | 157k | HtmlTokenId nToken = HtmlTokenId::RAWDATA; |
801 | 157k | SaveState( HtmlTokenId::NONE ); |
802 | 4.43M | while( bContinue && IsParserWorking() ) |
803 | 4.28M | { |
804 | 4.28M | bool bNextCh = true; |
805 | 4.28M | switch( nNextCh ) |
806 | 4.28M | { |
807 | 121k | case '<': |
808 | 121k | { |
809 | | // Maybe we've reached the end. |
810 | | |
811 | | // Save what we have read previously... |
812 | 121k | aToken.append( sTmpBuffer ); |
813 | 121k | sTmpBuffer.setLength(0); |
814 | | |
815 | | // and remember position in stream. |
816 | 121k | sal_uInt64 nStreamPos = rInput.Tell(); |
817 | 121k | sal_uInt32 nLineNr = GetLineNr(); |
818 | 121k | sal_uInt32 nLinePos = GetLinePos(); |
819 | | |
820 | | // Start of an end token? |
821 | 121k | bool bOffState = false; |
822 | 121k | if( '/' == (nNextCh = GetNextChar()) ) |
823 | 34.6k | { |
824 | 34.6k | bOffState = true; |
825 | 34.6k | nNextCh = GetNextChar(); |
826 | 34.6k | } |
827 | 86.7k | else if( '!' == nNextCh ) |
828 | 4.93k | { |
829 | 4.93k | sTmpBuffer.appendUtf32( nNextCh ); |
830 | 4.93k | nNextCh = GetNextChar(); |
831 | 4.93k | } |
832 | | |
833 | | // Read following letters |
834 | 518k | while( (rtl::isAsciiAlpha(nNextCh) || '-'==nNextCh) && |
835 | 396k | IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN ) |
836 | 396k | { |
837 | 396k | sTmpBuffer.appendUtf32( nNextCh ); |
838 | 396k | nNextCh = GetNextChar(); |
839 | 396k | } |
840 | | |
841 | 121k | OUString aTok( sTmpBuffer.toString() ); |
842 | 121k | aTok = aTok.toAsciiLowerCase(); |
843 | 121k | bool bDone = false; |
844 | 121k | if( bReadScript || !aEndToken.isEmpty() ) |
845 | 22.6k | { |
846 | 22.6k | if( !bReadComment ) |
847 | 13.4k | { |
848 | 13.4k | if( aTok.startsWith( OOO_STRING_SVTOOLS_HTML_comment ) ) |
849 | 633 | { |
850 | 633 | bReadComment = true; |
851 | 633 | } |
852 | 12.7k | else |
853 | 12.7k | { |
854 | | // A script has to end with "</SCRIPT>". But |
855 | | // ">" is optional for security reasons |
856 | 12.7k | bDone = bOffState && |
857 | 4.68k | ( bReadScript |
858 | 4.68k | ? aTok == OOO_STRING_SVTOOLS_HTML_script |
859 | 4.68k | : aTok == aEndToken ); |
860 | 12.7k | } |
861 | 13.4k | } |
862 | 22.6k | if( bReadComment && '>'==nNextCh && aTok.endsWith( "--" ) ) |
863 | 103 | { |
864 | | // End of comment of style <!-----> |
865 | 103 | bReadComment = false; |
866 | 103 | } |
867 | 22.6k | } |
868 | 98.7k | else |
869 | 98.7k | { |
870 | | // Style sheets can be closed by </STYLE>, </HEAD> or <BODY> |
871 | 98.7k | if( bOffState ) |
872 | 27.5k | bDone = aTok == OOO_STRING_SVTOOLS_HTML_style || |
873 | 15.9k | aTok == OOO_STRING_SVTOOLS_HTML_head; |
874 | 71.2k | else |
875 | 71.2k | bDone = aTok == OOO_STRING_SVTOOLS_HTML_body; |
876 | 98.7k | } |
877 | | |
878 | 121k | if( bDone ) |
879 | 15.2k | { |
880 | | // Done! Return the previously read string (if requested) |
881 | | // and continue. |
882 | | |
883 | 15.2k | bContinue = false; |
884 | | |
885 | | // nToken==0 means, GetNextToken_ continues to read |
886 | 15.2k | if( aToken.isEmpty() && (bReadStyle || bReadScript) ) |
887 | 8.15k | { |
888 | | // Immediately close environment (or context?) |
889 | | // and parse the end token |
890 | 8.15k | bReadScript = false; |
891 | 8.15k | bReadStyle = false; |
892 | 8.15k | aEndToken.clear(); |
893 | 8.15k | nToken = HtmlTokenId::NONE; |
894 | 8.15k | } |
895 | 7.12k | else |
896 | 7.12k | { |
897 | | // Keep bReadScript/bReadStyle alive |
898 | | // and parse end token during next execution |
899 | 7.12k | bEndTokenFound = true; |
900 | 7.12k | } |
901 | | |
902 | | // Move backwards in stream to '<' |
903 | 15.2k | rInput.Seek( nStreamPos ); |
904 | 15.2k | SetLineNr( nLineNr ); |
905 | 15.2k | SetLinePos( nLinePos ); |
906 | 15.2k | ClearTxtConvContext(); |
907 | 15.2k | nNextCh = '<'; |
908 | | |
909 | | // Don't append string to token. |
910 | 15.2k | sTmpBuffer.setLength( 0 ); |
911 | 15.2k | } |
912 | 106k | else |
913 | 106k | { |
914 | | // remember "</" , everything else we find in the buffer |
915 | 106k | aToken.append( "<" ); |
916 | 106k | if( bOffState ) |
917 | 20.3k | aToken.append( "/" ); |
918 | | |
919 | 106k | bNextCh = false; |
920 | 106k | } |
921 | 121k | } |
922 | 121k | break; |
923 | 77.2k | case '-': |
924 | 77.2k | sTmpBuffer.appendUtf32( nNextCh ); |
925 | 77.2k | if( bReadComment ) |
926 | 2.74k | { |
927 | 2.74k | bool bTwoMinus = false; |
928 | 2.74k | nNextCh = GetNextChar(); |
929 | 3.73k | while( '-' == nNextCh && IsParserWorking() ) |
930 | 994 | { |
931 | 994 | bTwoMinus = true; |
932 | 994 | sTmpBuffer.appendUtf32( nNextCh ); |
933 | 994 | nNextCh = GetNextChar(); |
934 | 994 | } |
935 | | |
936 | 2.74k | if( '>' == nNextCh && IsParserWorking() && bTwoMinus ) |
937 | 468 | bReadComment = false; |
938 | | |
939 | 2.74k | bNextCh = false; |
940 | 2.74k | } |
941 | 77.2k | break; |
942 | | |
943 | 31.4k | case '\r': |
944 | | // \r\n? closes the current text token (even if it's empty) |
945 | 31.4k | nNextCh = GetNextChar(); |
946 | 31.4k | if( nNextCh=='\n' ) |
947 | 29.7k | nNextCh = GetNextChar(); |
948 | 31.4k | bContinue = false; |
949 | 31.4k | break; |
950 | 110k | case '\n': |
951 | | // \n closes the current text token (even if it's empty) |
952 | 110k | nNextCh = GetNextChar(); |
953 | 110k | bContinue = false; |
954 | 110k | break; |
955 | 56.3k | case sal_Unicode(EOF): |
956 | | // eof closes the current text token and behaves like having read |
957 | | // an end token |
958 | 56.3k | if( rInput.eof() ) |
959 | 810 | { |
960 | 810 | bContinue = false; |
961 | 810 | if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() ) |
962 | 739 | { |
963 | 739 | bEndTokenFound = true; |
964 | 739 | } |
965 | 71 | else |
966 | 71 | { |
967 | 71 | bReadScript = false; |
968 | 71 | bReadStyle = false; |
969 | 71 | aEndToken.clear(); |
970 | 71 | nToken = HtmlTokenId::NONE; |
971 | 71 | } |
972 | 810 | } |
973 | 56.3k | break; |
974 | 3.88M | default: |
975 | 3.88M | if (!linguistic::IsControlChar(nNextCh) || nNextCh == '\t') |
976 | 3.77M | { |
977 | | // all remaining characters are appended to the buffer |
978 | 3.77M | sTmpBuffer.appendUtf32( nNextCh ); |
979 | 3.77M | } |
980 | 3.88M | break; |
981 | 4.28M | } |
982 | | |
983 | 4.28M | if( !bContinue && !sTmpBuffer.isEmpty() ) |
984 | 122k | { |
985 | 122k | aToken.append( sTmpBuffer ); |
986 | 122k | sTmpBuffer.setLength(0); |
987 | 122k | } |
988 | | |
989 | 4.28M | if( bContinue && bNextCh ) |
990 | 4.01M | nNextCh = GetNextChar(); |
991 | 4.28M | } |
992 | | |
993 | 157k | if( IsParserWorking() ) |
994 | 157k | SaveState( HtmlTokenId::NONE ); |
995 | 0 | else |
996 | 0 | nToken = HtmlTokenId::NONE; |
997 | | |
998 | 157k | return nToken; |
999 | 157k | } |
1000 | | |
1001 | | // Scan next token |
// Tokenizer step: reads from rInput, starting at the look-ahead character
// nNextCh, and returns the id of the next token.
// Early exits, in this order:
//  - a pending OFF token (generated for a tag closed with a trailing slash)
//    is returned first and aToken is emptied,
//  - HtmlTokenId::NONE if the parser is not in a working state,
//  - while script/style content is read or an end token is awaited, the
//    result of GetNextRawToken, unless that is NONE and the parser still works.
// Otherwise the main loop dispatches on nNextCh until a token id other than
// NONE is found or the parser state leaves Working.
// Returns HtmlTokenId::INVALID if the state is Pending on exit.
// Side effects visible here: sSaveToken, aToken, maOptions, mnPendingOffToken,
// eState, nNextCh and the bRead* flags are updated.
1002 | | HtmlTokenId HTMLParser::GetNextToken_() |
1003 | 7.20M | { |
1004 | 7.20M | HtmlTokenId nRet = HtmlTokenId::NONE; |
1005 | 7.20M | sSaveToken.clear(); |
1006 | | |
1007 | 7.20M | if (mnPendingOffToken != HtmlTokenId::NONE) |
1008 | 257k | { |
1009 | | // HtmlTokenId::<TOKEN>_OFF generated for HtmlTokenId::<TOKEN>_ON |
1010 | 257k | nRet = mnPendingOffToken; |
1011 | 257k | mnPendingOffToken = HtmlTokenId::NONE; |
1012 | 257k | aToken.setLength( 0 ); |
1013 | 257k | return nRet; |
1014 | 257k | } |
1015 | | |
1016 | | // Delete options |
1017 | 6.94M | maOptions.clear(); |
1018 | | |
1019 | 6.94M | if( !IsParserWorking() ) // Don't continue if already an error occurred |
1020 | 247k | return HtmlTokenId::NONE; |
1021 | | |
// bReadNextChar is set further down when SCRIPT_OFF is returned: in that case
// the look-ahead character was not fetched then and is fetched only now.
// The old flag value is saved so it can be restored if the state turns Pending.
1022 | 6.69M | bool bReadNextCharSave = bReadNextChar; |
1023 | 6.69M | if( bReadNextChar ) |
1024 | 3.23k | { |
1025 | 3.23k | DBG_ASSERT( !bEndTokenFound, |
1026 | 3.23k | "Read a character despite </SCRIPT> was read?" ); |
1027 | 3.23k | nNextCh = GetNextChar(); |
1028 | 3.23k | if( !IsParserWorking() ) // Don't continue if already an error occurred |
1029 | 0 | return HtmlTokenId::NONE; |
1030 | 3.23k | bReadNextChar = false; |
1031 | 3.23k | } |
1032 | | |
// Raw mode: script or style content, or an explicitly awaited end token, is
// handed to GetNextRawToken. Only a NONE result with a working parser falls
// through to the regular tokenizer loop below.
1033 | 6.69M | if( bReadScript || bReadStyle || !aEndToken.isEmpty() ) |
1034 | 165k | { |
1035 | 165k | nRet = GetNextRawToken(); |
1036 | 165k | if( nRet != HtmlTokenId::NONE || !IsParserWorking() ) |
1037 | 149k | return nRet; |
1038 | 165k | } |
1039 | | |
1040 | 6.56M | do { |
// bNextCh: whether the look-ahead character has been consumed by this round
// and a fresh one must be read after the switch.
1041 | 6.56M | bool bNextCh = true; |
1042 | 6.56M | switch( nNextCh ) |
1043 | 6.56M | { |
1044 | 3.74M | case '<': |
1045 | 3.74M | { |
// Remember stream position and line info so that the tag can be abandoned
// and rescanned as plain text if EOF is hit before the closing >.
1046 | 3.74M | sal_uInt64 nStreamPos = rInput.Tell(); |
1047 | 3.74M | sal_uInt32 nLineNr = GetLineNr(); |
1048 | 3.74M | sal_uInt32 nLinePos = GetLinePos(); |
1049 | | |
1050 | 3.74M | bool bOffState = false; |
1051 | 3.74M | if( '/' == (nNextCh = GetNextChar()) ) |
1052 | 730k | { |
1053 | 730k | bOffState = true; |
1054 | 730k | nNextCh = GetNextChar(); |
1055 | 730k | } |
1056 | | // Assume '<?' is a start of an XML declaration, ignore it. |
1057 | 3.74M | if (rtl::isAsciiAlpha(nNextCh) || nNextCh == '!' || nNextCh == '?') |
1058 | 3.57M | { |
// Collect the tag name: everything up to >, /, white space, a control
// character, EOF or a parser state change.
1059 | 3.57M | OUStringBuffer sTmpBuffer; |
1060 | 12.6M | do { |
1061 | 12.6M | sTmpBuffer.appendUtf32( nNextCh ); |
1062 | 12.6M | nNextCh = GetNextChar(); |
// The CDATA opener is recognised as soon as it is complete, whatever follows.
1063 | 12.6M | if (std::u16string_view(sTmpBuffer) == u"![CDATA[") |
1064 | 1.30k | break; |
// Only when bFuzzing is set: give up on tag names longer than 1024 code units
// by putting the parser into the error state.
1065 | 12.6M | if (bFuzzing && sTmpBuffer.getLength() > 1024) |
1066 | 50 | { |
1067 | 50 | SAL_WARN("svtools", "abandoning import for performance reasons with long tokens"); |
1068 | 50 | eState = SvParserState::Error; |
1069 | 50 | break; |
1070 | 50 | } |
1071 | 12.6M | } while( '>' != nNextCh && '/' != nNextCh && !rtl::isAsciiWhiteSpace( nNextCh ) && |
1072 | 9.10M | !linguistic::IsControlChar(nNextCh) && |
1073 | 9.06M | IsParserWorking() && !rInput.eof() ); |
1074 | | |
1075 | 3.57M | if( !sTmpBuffer.isEmpty() ) |
1076 | 3.57M | { |
1077 | 3.57M | aToken.append( sTmpBuffer ); |
1078 | 3.57M | sTmpBuffer.setLength(0); |
1079 | 3.57M | } |
1080 | | |
1081 | | // Skip blanks |
1082 | 4.85M | while( rtl::isAsciiWhiteSpace( nNextCh ) && IsParserWorking() ) |
1083 | 1.27M | nNextCh = GetNextChar(); |
1084 | | |
1085 | 3.57M | if( !IsParserWorking() ) |
1086 | 50 | { |
1087 | 50 | if( SvParserState::Pending == eState ) |
1088 | 0 | bReadNextChar = bReadNextCharSave; |
1089 | 50 | break; |
1090 | 50 | } |
1091 | | |
1092 | | // Search token in table: |
// sSaveToken keeps the tag name in its original case; the lookup itself uses
// the ASCII-lowercased name with a configured namespace prefix removed.
1093 | 3.57M | sSaveToken = aToken; |
1094 | 3.57M | aToken = aToken.toString().toAsciiLowerCase(); |
1095 | | |
1096 | 3.57M | if (!maNamespace.isEmpty() && o3tl::starts_with(aToken, maNamespace)) |
1097 | 0 | aToken.remove( 0, maNamespace.getLength()); |
1098 | | |
1099 | 3.57M | if( HtmlTokenId::NONE == (nRet = GetHTMLToken( aToken )) ) |
1100 | | // Unknown control |
1101 | 395k | nRet = HtmlTokenId::UNKNOWNCONTROL_ON; |
1102 | | |
1103 | | // If it's a token which can be switched off... |
// The code relies on the OFF id of an on/off token being its ON id plus one
// (the same arithmetic is used for the pending OFF token further down).
1104 | 3.57M | if( bOffState ) |
1105 | 718k | { |
1106 | 718k | if( nRet >= HtmlTokenId::ONOFF_START ) |
1107 | 716k | { |
1108 | | // and there is an off token, return off token instead |
1109 | 716k | nRet = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1); |
1110 | 716k | } |
1111 | 1.63k | else if( HtmlTokenId::LINEBREAK!=nRet || !maNamespace.isEmpty()) |
1112 | 925 | { |
1113 | | // and there is no off token, return unknown token. |
1114 | | // (except for </BR>, that is treated like <BR>) |
1115 | | // No exception for XHTML, though. |
1116 | 925 | nRet = HtmlTokenId::UNKNOWNCONTROL_OFF; |
1117 | 925 | } |
1118 | 718k | } |
1119 | | |
1120 | 3.57M | if( nRet == HtmlTokenId::COMMENT ) |
1121 | 8.57k | { |
1122 | | // fix: due to being case sensitive use sSaveToken as start of comment |
1123 | | // and append a blank. |
1124 | 8.57k | aToken = sSaveToken; |
1125 | 8.57k | if( '>'!=nNextCh ) |
1126 | 5.75k | aToken.append( " " ); |
// State captured at the first > seen inside the comment. nCStreamPos == 0
// doubles as the marker that no > has been seen yet.
1127 | 8.57k | sal_uInt64 nCStreamPos = 0; |
1128 | 8.57k | sal_uInt32 nCLineNr = 0; |
1129 | 8.57k | sal_uInt32 nCLinePos = 0; |
1130 | 8.57k | sal_Int32 nCStrLen = 0; |
1131 | | |
1132 | 8.57k | bool bDone = false; |
1133 | | // Read until closing -->. If not found restart at first > |
1134 | 8.57k | sTmpBuffer = aToken; |
1135 | 1.27M | while( !bDone && !rInput.eof() && IsParserWorking() ) |
1136 | 1.27M | { |
1137 | 1.27M | if( '>'==nNextCh ) |
1138 | 61.6k | { |
1139 | 61.6k | if( !nCStreamPos ) |
1140 | 7.53k | { |
1141 | 7.53k | nCStreamPos = rInput.Tell(); |
1142 | 7.53k | nCStrLen = sTmpBuffer.getLength(); |
1143 | 7.53k | nCLineNr = GetLineNr(); |
1144 | 7.53k | nCLinePos = GetLinePos(); |
1145 | 7.53k | } |
// The comment ends at a > that directly follows two minus signs.
1146 | 61.6k | bDone = sTmpBuffer.getLength() >= 2 && sTmpBuffer[sTmpBuffer.getLength() - 2] == '-' && sTmpBuffer[sTmpBuffer.getLength() - 1] == '-'; |
1147 | 61.6k | if( !bDone ) |
1148 | 54.5k | sTmpBuffer.appendUtf32(nNextCh); |
1149 | 61.6k | } |
// Control characters other than CR, LF and TAB are dropped from the comment.
1150 | 1.20M | else if (!linguistic::IsControlChar(nNextCh) |
1151 | 77.1k | || nNextCh == '\r' || nNextCh == '\n' || nNextCh == '\t') |
1152 | 1.17M | { |
1153 | 1.17M | sTmpBuffer.appendUtf32(nNextCh); |
1154 | 1.17M | } |
1155 | 1.27M | if( !bDone ) |
1156 | 1.26M | nNextCh = GetNextChar(); |
1157 | 1.27M | } |
1158 | 8.57k | aToken = sTmpBuffer; |
1159 | 8.57k | sTmpBuffer.setLength(0); |
// No proper comment end found: rewind to the first > and cut the collected
// text back to what had been read at that point.
1160 | 8.57k | if( !bDone && IsParserWorking() && nCStreamPos ) |
1161 | 428 | { |
1162 | 428 | rInput.Seek( nCStreamPos ); |
1163 | 428 | SetLineNr( nCLineNr ); |
1164 | 428 | SetLinePos( nCLinePos ); |
1165 | 428 | ClearTxtConvContext(); |
1166 | 428 | aToken.truncate(nCStrLen); |
1167 | 428 | nNextCh = '>'; |
1168 | 428 | } |
1169 | 8.57k | } |
1170 | 3.57M | else if (nRet == HtmlTokenId::CDATA) |
1171 | 1.30k | { |
1172 | | // Read until the closing ]]>. |
// sTmpBuffer is empty here (it was flushed into aToken above), so aToken ends
// up holding only the CDATA payload. Control characters are dropped.
1173 | 1.30k | bool bDone = false; |
1174 | 300k | while (!bDone && !rInput.eof() && IsParserWorking()) |
1175 | 298k | { |
1176 | 298k | if (nNextCh == '>') |
1177 | 11.8k | { |
1178 | 11.8k | if (sTmpBuffer.getLength() >= 2) |
1179 | 11.7k | { |
1180 | 11.7k | bDone = sTmpBuffer[sTmpBuffer.getLength() - 2] == ']' |
1181 | 1.11k | && sTmpBuffer[sTmpBuffer.getLength() - 1] == ']'; |
1182 | 11.7k | if (bDone) |
1183 | 1.06k | { |
1184 | | // Ignore ]] at the end. |
1185 | 1.06k | sTmpBuffer.setLength(sTmpBuffer.getLength() - 2); |
1186 | 1.06k | } |
1187 | 11.7k | } |
1188 | 11.8k | if (!bDone) |
1189 | 10.7k | { |
1190 | 10.7k | sTmpBuffer.appendUtf32(nNextCh); |
1191 | 10.7k | } |
1192 | 11.8k | } |
1193 | 287k | else if (!linguistic::IsControlChar(nNextCh)) |
1194 | 222k | { |
1195 | 222k | sTmpBuffer.appendUtf32(nNextCh); |
1196 | 222k | } |
1197 | 298k | if (!bDone) |
1198 | 297k | { |
1199 | 297k | nNextCh = GetNextChar(); |
1200 | 297k | } |
1201 | 298k | } |
1202 | 1.30k | aToken = sTmpBuffer; |
1203 | 1.30k | sTmpBuffer.setLength(0); |
1204 | 1.30k | } |
1205 | 3.56M | else |
1206 | 3.56M | { |
1207 | | // TokenString not needed anymore |
1208 | 3.56M | aToken.setLength( 0 ); |
1209 | 3.56M | } |
1210 | | |
1211 | | // Read until closing '>' |
// NOTE(review): ScanText is not visible here; judging by the checks that
// follow it presumably collects the rest of the tag into aToken - confirm.
1212 | 3.57M | if( '>' != nNextCh && IsParserWorking() ) |
1213 | 1.30M | { |
1214 | 1.30M | ScanText( '>' ); |
1215 | | |
1216 | | // fdo#34666 fdo#36080 fdo#36390: closing "/>"?: |
1217 | | // generate pending HtmlTokenId::<TOKEN>_OFF for HtmlTokenId::<TOKEN>_ON |
1218 | | // Do not convert this to a single HtmlTokenId::<TOKEN>_OFF |
1219 | | // which lead to fdo#56772. |
1220 | 1.30M | if ((nRet >= HtmlTokenId::ONOFF_START) && o3tl::ends_with(aToken, u"/")) |
1221 | 257k | { |
1222 | 257k | mnPendingOffToken = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1); // HtmlTokenId::<TOKEN>_ON -> HtmlTokenId::<TOKEN>_OFF |
1223 | 257k | aToken.setLength( aToken.getLength()-1 ); // remove trailing '/' |
1224 | 257k | } |
1225 | 1.30M | if( sal_Unicode(EOF) == nNextCh && rInput.eof() ) |
1226 | 27.5k | { |
1227 | | // Move back in front of < and restart there. |
1228 | | // Return < as text. |
1229 | 27.5k | rInput.Seek( nStreamPos ); |
1230 | 27.5k | SetLineNr( nLineNr ); |
1231 | 27.5k | SetLinePos( nLinePos ); |
1232 | 27.5k | ClearTxtConvContext(); |
1233 | | |
1234 | 27.5k | aToken = "<"; |
1235 | 27.5k | nRet = HtmlTokenId::TEXTTOKEN; |
1236 | 27.5k | nNextCh = GetNextChar(); |
1237 | 27.5k | bNextCh = false; |
1238 | 27.5k | break; |
1239 | 27.5k | } |
1240 | 1.30M | } |
1241 | 3.55M | if( SvParserState::Pending == eState ) |
1242 | 0 | bReadNextChar = bReadNextCharSave; |
1243 | 3.55M | } |
1244 | 163k | else |
1245 | 163k | { |
// The character after < (or after </) cannot start a tag name.
1246 | 163k | if( bOffState ) |
1247 | 11.8k | { |
1248 | | // simply throw away everything |
1249 | 11.8k | ScanText( '>' ); |
1250 | 11.8k | if( sal_Unicode(EOF) == nNextCh && rInput.eof() ) |
1251 | 827 | { |
1252 | | // Move back in front of < and restart there. |
1253 | | // Return < as text. |
1254 | 827 | rInput.Seek( nStreamPos ); |
1255 | 827 | SetLineNr( nLineNr ); |
1256 | 827 | SetLinePos( nLinePos ); |
1257 | 827 | ClearTxtConvContext(); |
1258 | | |
1259 | 827 | aToken = "<"; |
1260 | 827 | nRet = HtmlTokenId::TEXTTOKEN; |
1261 | 827 | nNextCh = GetNextChar(); |
1262 | 827 | bNextCh = false; |
1263 | 827 | break; |
1264 | 827 | } |
1265 | 11.0k | if( SvParserState::Pending == eState ) |
1266 | 0 | bReadNextChar = bReadNextCharSave; |
1267 | 11.0k | aToken.setLength( 0 ); |
1268 | 11.0k | } |
1269 | 151k | else if( '%' == nNextCh ) |
1270 | 1.60k | { |
// A percent construct is reported as an unknown control. Its text is kept
// only in sSaveToken; aToken is emptied once the closing pair was found.
1271 | 1.60k | nRet = HtmlTokenId::UNKNOWNCONTROL_ON; |
1272 | | |
1273 | 1.60k | sal_uInt64 nCStreamPos = rInput.Tell(); |
1274 | 1.60k | sal_uInt32 nCLineNr = GetLineNr(), nCLinePos = GetLinePos(); |
1275 | | |
1276 | 1.60k | bool bDone = false; |
1277 | | // Read until closing %>. If not found restart at first >. |
1278 | 1.60k | sal_Unicode nLastTokenChar = !aToken.isEmpty() ? aToken[aToken.getLength() - 1] : 0; |
1279 | 1.60k | OUStringBuffer aTmpBuffer(aToken); |
1280 | 2.39M | while( !bDone && !rInput.eof() && IsParserWorking() ) |
1281 | 2.39M | { |
1282 | 2.39M | bDone = '>'==nNextCh && nLastTokenChar == '%'; |
1283 | 2.39M | if( !bDone ) |
1284 | 2.39M | { |
1285 | 2.39M | aTmpBuffer.appendUtf32(nNextCh); |
1286 | 2.39M | nLastTokenChar = aTmpBuffer[aTmpBuffer.getLength() - 1]; |
1287 | 2.39M | nNextCh = GetNextChar(); |
1288 | 2.39M | } |
1289 | 2.39M | } |
// No closing pair found: rewind to the position saved above and return the
// two opening characters as text.
1290 | 1.60k | if( !bDone && IsParserWorking() ) |
1291 | 1.01k | { |
1292 | 1.01k | rInput.Seek( nCStreamPos ); |
1293 | 1.01k | SetLineNr( nCLineNr ); |
1294 | 1.01k | SetLinePos( nCLinePos ); |
1295 | 1.01k | ClearTxtConvContext(); |
1296 | 1.01k | aToken = "<%"; |
1297 | 1.01k | nRet = HtmlTokenId::TEXTTOKEN; |
1298 | 1.01k | break; |
1299 | 1.01k | } |
1300 | 596 | aToken = aTmpBuffer; |
1301 | 596 | aTmpBuffer.setLength(0); |
1302 | 596 | if( IsParserWorking() ) |
1303 | 596 | { |
1304 | 596 | sSaveToken = aToken; |
1305 | 596 | aToken.setLength( 0 ); |
1306 | 596 | } |
1307 | 596 | } |
1308 | 149k | else |
1309 | 149k | { |
// A lone < is returned as text; the character read after it stays the
// look-ahead for the next round.
1310 | 149k | aToken = "<"; |
1311 | 149k | nRet = HtmlTokenId::TEXTTOKEN; |
1312 | 149k | bNextCh = false; |
1313 | 149k | break; |
1314 | 149k | } |
1315 | 163k | } |
1316 | | |
1317 | 3.56M | if( IsParserWorking() ) |
1318 | 3.56M | { |
// Advance past the closing > only if the look-ahead really is one, then
// update the content-mode flags for tags that switch the reading mode.
1319 | 3.56M | bNextCh = '>' == nNextCh; |
1320 | 3.56M | switch( nRet ) |
1321 | 3.56M | { |
1322 | 2.19k | case HtmlTokenId::TEXTAREA_ON: |
1323 | 2.19k | bReadTextArea = true; |
1324 | 2.19k | break; |
1325 | 1.04k | case HtmlTokenId::TEXTAREA_OFF: |
1326 | 1.04k | bReadTextArea = false; |
1327 | 1.04k | break; |
1328 | 2.44k | case HtmlTokenId::SCRIPT_ON: |
1329 | 2.44k | if( !bReadTextArea ) |
1330 | 2.25k | bReadScript = true; |
1331 | 2.44k | break; |
1332 | 3.36k | case HtmlTokenId::SCRIPT_OFF: |
1333 | 3.36k | if( !bReadTextArea ) |
1334 | 3.23k | { |
1335 | 3.23k | bReadScript = false; |
1336 | | // JavaScript might modify the stream, |
1337 | | // thus the last character has to be read again. |
1338 | 3.23k | bReadNextChar = true; |
1339 | 3.23k | bNextCh = false; |
1340 | 3.23k | } |
1341 | 3.36k | break; |
1342 | | |
1343 | 13.8k | case HtmlTokenId::STYLE_ON: |
1344 | 13.8k | bReadStyle = true; |
1345 | 13.8k | break; |
1346 | 15.6k | case HtmlTokenId::STYLE_OFF: |
1347 | 15.6k | bReadStyle = false; |
1348 | 15.6k | break; |
1349 | 3.52M | default: break; |
1350 | 3.56M | } |
1351 | 3.56M | } |
1352 | 3.56M | } |
1353 | 3.56M | break; |
1354 | | |
1355 | 3.56M | case sal_Unicode(EOF): |
// A real end of input accepts the document and returns the EOF character
// value cast to a token id. An EOF-valued character in the middle of the
// stream is treated as normal text.
1356 | 43.8k | if( rInput.eof() ) |
1357 | 42.7k | { |
1358 | 42.7k | eState = SvParserState::Accepted; |
1359 | 42.7k | nRet = HtmlTokenId(nNextCh); |
1360 | 42.7k | } |
1361 | 1.11k | else |
1362 | 1.11k | { |
1363 | | // Read normal text. |
1364 | 1.11k | goto scan_text; |
1365 | 1.11k | } |
1366 | 42.7k | break; |
1367 | | |
1368 | 127k | case '\f': |
1369 | | // form feeds are passed upwards separately |
1370 | 127k | nRet = HtmlTokenId::LINEFEEDCHAR; // !!! should be FORMFEEDCHAR |
1371 | 127k | break; |
1372 | | |
1373 | 446k | case '\n': |
1374 | 595k | case '\r': |
// In listing, XMP, PRE or textarea mode a line end becomes NEWPARA. A CR LF
// or LF CR pair counts as one line end: if the following character is not
// the second half of such a pair it is kept as the next look-ahead.
1375 | 595k | if( bReadListing || bReadXMP || bReadPRE || bReadTextArea ) |
1376 | 189k | { |
1377 | 189k | sal_Unicode c = GetNextChar(); |
1378 | 189k | if( ( '\n' != nNextCh || '\r' != c ) && |
1379 | 188k | ( '\r' != nNextCh || '\n' != c ) ) |
1380 | 173k | { |
1381 | 173k | bNextCh = false; |
1382 | 173k | nNextCh = c; |
1383 | 173k | } |
1384 | 189k | nRet = HtmlTokenId::NEWPARA; |
1385 | 189k | break; |
1386 | 189k | } |
1387 | 406k | [[fallthrough]]; |
1388 | 575k | case '\t': |
1389 | 575k | if( bReadPRE ) |
1390 | 162k | { |
1391 | 162k | nRet = HtmlTokenId::TABCHAR; |
1392 | 162k | break; |
1393 | 162k | } |
1394 | 412k | [[fallthrough]]; |
1395 | 504k | case ' ': |
1396 | 504k | [[fallthrough]]; |
1397 | 2.29M | default: |
1398 | | |
1399 | 2.29M | scan_text: |
1400 | | // "normal" text to come |
1401 | 2.29M | nRet = ScanText(); |
// Advance only if ScanText left aToken empty.
1402 | 2.29M | bNextCh = 0 == aToken.getLength(); |
1403 | | |
1404 | | // the text should be processed |
1405 | 2.29M | if( !bNextCh && eState == SvParserState::Pending ) |
1406 | 0 | { |
1407 | 0 | eState = SvParserState::Working; |
1408 | 0 | bReadNextChar = true; |
1409 | 0 | } |
1410 | | |
1411 | 2.29M | break; |
1412 | 6.56M | } |
1413 | | |
// Fetch the next look-ahead character. If that turns the state to Pending
// although a non-text token is ready, the token is still delivered now and
// the read is repeated on the next call via bReadNextChar.
1414 | 6.56M | if( bNextCh && SvParserState::Working == eState ) |
1415 | 4.58M | { |
1416 | 4.58M | nNextCh = GetNextChar(); |
1417 | 4.58M | if( SvParserState::Pending == eState && nRet != HtmlTokenId::NONE && HtmlTokenId::TEXTTOKEN != nRet ) |
1418 | 0 | { |
1419 | 0 | bReadNextChar = true; |
1420 | 0 | eState = SvParserState::Working; |
1421 | 0 | } |
1422 | 4.58M | } |
1423 | | |
1424 | 6.56M | } while( nRet == HtmlTokenId::NONE && SvParserState::Working == eState ); |
1425 | | |
1426 | 6.54M | if( SvParserState::Pending == eState ) |
1427 | 0 | nRet = HtmlTokenId::INVALID; // s.th. invalid |
1428 | | |
1429 | 6.54M | return nRet; |
1430 | 6.54M | } |
1431 | | |
// Removes escaping backslashes from aToken in place.
// A backslash that is not itself escaped is deleted and marks the character
// that follows it as escaped; that character is kept unchanged, even if it is
// another backslash. All other characters are left as they are.
1432 | | void HTMLParser::UnescapeToken() |
1433 | 13.2k | { |
1434 | 13.2k | sal_Int32 nPos=0; |
1435 | | |
1436 | 13.2k | bool bEscape = false; |
1437 | 456k | while( nPos < aToken.getLength() ) |
1438 | 443k | { |
// bOldEscape: the previous character was an escaping backslash (now removed).
1439 | 443k | bool bOldEscape = bEscape; |
1440 | 443k | bEscape = false; |
1441 | 443k | if( '\\'==aToken[nPos] && !bOldEscape ) |
1442 | 1.07k | { |
// nPos is not advanced: after the removal it already indexes the next char.
1443 | 1.07k | aToken.remove( nPos, 1 ); |
1444 | 1.07k | bEscape = true; |
1445 | 1.07k | } |
1446 | 442k | else |
1447 | 442k | { |
1448 | 442k | nPos++; |
1449 | 442k | } |
1450 | 443k | } |
1451 | 13.2k | } |
1452 | | |
// Splits the option string held in aToken into name/value entries, stores
// them in maOptions and returns that container.
// If maOptions is already filled it is returned unchanged, so repeated calls
// for the same token do not parse again (GetNextToken_ clears maOptions).
// pNoConvertToken: optional pointer to one option id. CR and LF inside a
// quoted value of that option are kept. They are also kept for option ids in
// the range from HtmlOptionId::SCRIPT_START up to, but excluding, SCRIPT_END.
// For every other option they are removed.
// Note: aToken is modified in place while parsing (escaping backslashes and
// stripped line ends are removed from it).
1453 | | const HTMLOptions& HTMLParser::GetOptions( HtmlOptionId const *pNoConvertToken ) |
1454 | 2.28M | { |
1455 | | // If the options for the current token have already been returned, |
1456 | | // return them once again. |
1457 | 2.28M | if (!maOptions.empty()) |
1458 | 82.9k | return maOptions; |
1459 | | |
1460 | 2.20M | sal_Int32 nPos = 0; |
1461 | 4.54M | while( nPos < aToken.getLength() ) |
1462 | 2.33M | { |
1463 | | // A letter? Option beginning here. |
1464 | 2.33M | if( rtl::isAsciiAlpha( aToken[nPos] ) ) |
1465 | 1.41M | { |
1466 | 1.41M | HtmlOptionId nToken; |
1467 | 1.41M | OUString aValue; |
1468 | 1.41M | sal_Int32 nStt = nPos; |
1469 | 1.41M | sal_Unicode cChar = 0; |
1470 | | |
1471 | | // Actually only certain characters allowed. |
1472 | | // Netscape only looks for "=" and white space (c.f. |
1473 | | // Mozilla: PA_FetchRequestedNameValues in libparse/pa_mdl.c) |
// Step 1: the option name runs up to the first equals sign, white space or
// character rejected by HTML_ISPRINTABLE (macro defined outside this view).
1474 | 9.24M | while( nPos < aToken.getLength() ) |
1475 | 9.03M | { |
1476 | 9.03M | cChar = aToken[nPos]; |
1477 | 9.03M | if ( '=' == cChar ||!HTML_ISPRINTABLE(cChar) || rtl::isAsciiWhiteSpace(cChar) ) |
1478 | 1.20M | break; |
1479 | 7.83M | nPos++; |
1480 | 7.83M | } |
1481 | | |
1482 | 1.41M | OUString sName( aToken.subView( nStt, nPos-nStt ) ); |
1483 | | |
1484 | | // PlugIns require original token name. Convert to lower case only for searching. |
1485 | 1.41M | nToken = GetHTMLOption( sName.toAsciiLowerCase() ); // Name is ready |
1486 | 1.41M | SAL_WARN_IF( nToken==HtmlOptionId::UNKNOWN, "svtools", |
1487 | 1.41M | "GetOption: unknown HTML option '" << sName << "'" ); |
1488 | 1.41M | bool bStripCRLF = (nToken < HtmlOptionId::SCRIPT_START || |
1489 | 392k | nToken >= HtmlOptionId::SCRIPT_END) && |
1490 | 1.40M | (!pNoConvertToken || nToken != *pNoConvertToken); |
1491 | | |
// Step 2: skip white space and non-printable characters after the name.
1492 | 1.56M | while( nPos < aToken.getLength() ) |
1493 | 1.34M | { |
1494 | 1.34M | cChar = aToken[nPos]; |
1495 | 1.34M | if ( HTML_ISPRINTABLE(cChar) && !rtl::isAsciiWhiteSpace(cChar) ) |
1496 | 1.19M | break; |
1497 | 153k | nPos++; |
1498 | 153k | } |
1499 | | |
1500 | | // Option with value? |
1501 | 1.41M | if( nPos!=aToken.getLength() && '='==cChar ) |
1502 | 1.07M | { |
1503 | 1.07M | nPos++; |
1504 | | |
// Step 3: skip blanks, tabs, line ends and non-printables after the equals sign.
1505 | 1.08M | while( nPos < aToken.getLength() ) |
1506 | 1.07M | { |
1507 | 1.07M | cChar = aToken[nPos]; |
1508 | 1.07M | if ( HTML_ISPRINTABLE(cChar) && ' ' != cChar && '\t' != cChar && '\r' != cChar && '\n' != cChar ) |
1509 | 1.07M | break; |
1510 | 7.57k | nPos++; |
1511 | 7.57k | } |
1512 | | |
1513 | 1.07M | if( nPos != aToken.getLength() ) |
1514 | 1.07M | { |
// Step 4: read the value. nStt marks its start and nLen counts the characters
// kept. Characters dropped from the value are removed from aToken itself, so
// that the value stays the contiguous range [nStt, nStt + nLen).
// Quoted form: the value ends at the next unescaped quote character of the
// same kind as the opening one. Line ends are removed or kept according to
// bStripCRLF, and an unescaped backslash is removed and escapes what follows.
1515 | 1.07M | sal_Int32 nLen = 0; |
1516 | 1.07M | nStt = nPos; |
1517 | 1.07M | if( ('"'==cChar) || '\''==cChar ) |
1518 | 810k | { |
1519 | 810k | sal_Unicode cEnd = cChar; |
1520 | 810k | nPos++; nStt++; |
1521 | 810k | bool bDone = false; |
1522 | 810k | bool bEscape = false; |
1523 | 19.0M | while( nPos < aToken.getLength() && !bDone ) |
1524 | 18.2M | { |
1525 | 18.2M | bool bOldEscape = bEscape; |
1526 | 18.2M | bEscape = false; |
1527 | 18.2M | cChar = aToken[nPos]; |
1528 | 18.2M | switch( cChar ) |
1529 | 18.2M | { |
1530 | 22.8k | case '\r': |
1531 | 94.8k | case '\n': |
1532 | 94.8k | if( bStripCRLF ) |
1533 | 84.9k | aToken.remove( nPos, 1 ); |
1534 | 9.86k | else |
1535 | 9.86k | { |
1536 | 9.86k | nPos++; |
1537 | 9.86k | nLen++; |
1538 | 9.86k | } |
1539 | 94.8k | break; |
1540 | 51.9k | case '\\': |
1541 | 51.9k | if( bOldEscape ) |
1542 | 20.2k | { |
1543 | 20.2k | nPos++; |
1544 | 20.2k | nLen++; |
1545 | 20.2k | } |
1546 | 31.6k | else |
1547 | 31.6k | { |
1548 | 31.6k | aToken.remove( nPos, 1 ); |
1549 | 31.6k | bEscape = true; |
1550 | 31.6k | } |
1551 | 51.9k | break; |
1552 | 814k | case '"': |
1553 | 876k | case '\'': |
1554 | 876k | bDone = !bOldEscape && cChar==cEnd; |
1555 | 876k | if( !bDone ) |
1556 | 65.7k | { |
1557 | 65.7k | nPos++; |
1558 | 65.7k | nLen++; |
1559 | 65.7k | } |
1560 | 876k | break; |
1561 | 17.1M | default: |
1562 | 17.1M | nPos++; |
1563 | 17.1M | nLen++; |
1564 | 17.1M | break; |
1565 | 18.2M | } |
1566 | 18.2M | } |
// Step over the closing quote unless the string ended without one.
1567 | 810k | if( nPos!=aToken.getLength() ) |
1568 | 810k | nPos++; |
1569 | 810k | } |
1570 | 260k | else |
1571 | 260k | { |
1572 | | // More liberal than the standard: allow all printable characters |
// Unquoted form: the value ends at an unescaped blank, at a tab or line end,
// or at a character rejected by HTML_ISPRINTABLE. The terminating character
// is not consumed here.
1573 | 260k | bool bEscape = false; |
1574 | 260k | bool bDone = false; |
1575 | 2.20M | while( nPos < aToken.getLength() && !bDone ) |
1576 | 1.94M | { |
1577 | 1.94M | bool bOldEscape = bEscape; |
1578 | 1.94M | bEscape = false; |
1579 | 1.94M | sal_Unicode c = aToken[nPos]; |
1580 | 1.94M | switch( c ) |
1581 | 1.94M | { |
1582 | 104k | case ' ': |
1583 | 104k | bDone = !bOldEscape; |
1584 | 104k | if( !bDone ) |
1585 | 331 | { |
1586 | 331 | nPos++; |
1587 | 331 | nLen++; |
1588 | 331 | } |
1589 | 104k | break; |
1590 | | |
1591 | 246 | case '\t': |
1592 | 1.29k | case '\r': |
1593 | 2.68k | case '\n': |
1594 | 2.68k | bDone = true; |
1595 | 2.68k | break; |
1596 | | |
1597 | 31.6k | case '\\': |
1598 | 31.6k | if( bOldEscape ) |
1599 | 15.6k | { |
1600 | 15.6k | nPos++; |
1601 | 15.6k | nLen++; |
1602 | 15.6k | } |
1603 | 16.0k | else |
1604 | 16.0k | { |
1605 | 16.0k | aToken.remove( nPos, 1 ); |
1606 | 16.0k | bEscape = true; |
1607 | 16.0k | } |
1608 | 31.6k | break; |
1609 | | |
1610 | 1.80M | default: |
1611 | 1.80M | if( HTML_ISPRINTABLE( c ) ) |
1612 | 1.80M | { |
1613 | 1.80M | nPos++; |
1614 | 1.80M | nLen++; |
1615 | 1.80M | } |
1616 | 353 | else |
1617 | 353 | bDone = true; |
1618 | 1.80M | break; |
1619 | 1.94M | } |
1620 | 1.94M | } |
1621 | 260k | } |
1622 | | |
1623 | 1.07M | if( nLen ) |
1624 | 1.06M | aValue = aToken.subView( nStt, nLen ); |
1625 | 1.07M | } |
1626 | 1.07M | } |
1627 | | |
1628 | | // Token is known and can be saved |
// The entry is stored even if the option id is UNKNOWN or the value is empty.
// sName keeps the original spelling of the option name.
1629 | 1.41M | maOptions.emplace_back(nToken, sName, aValue); |
1630 | | |
1631 | 1.41M | } |
1632 | 926k | else |
1633 | | // Ignore white space and unexpected characters |
1634 | 926k | nPos++; |
1635 | 2.33M | } |
1636 | | |
1637 | 2.20M | return maOptions; |
1638 | 2.20M | } |
1639 | | |
// Maps a token to the token that should be reported for it.
// NOTE(review): judging by the name and the nPre_ members this is meant for
// tokens read inside a PRE block - confirm against the callers.
//  - PARABREAK_ON is turned into LINEBREAK.
//  - LINEBREAK and NEWPARA reset the column counter nPre_LinePos and are
//    suppressed (NONE) while bPre_IgnoreNewPara is set.
//  - TABCHAR becomes a TEXTTOKEN: aToken is padded with blanks up to the
//    distance to the next column that is a multiple of 8.
//  - TEXTTOKEN advances nPre_LinePos by the length of aToken.
//  - The tokens listed explicitly below are passed through unchanged.
//  - Any other token except NONE is replaced by UNKNOWNCONTROL_ON or, for an
//    OFF token, by UNKNOWNCONTROL_OFF.
// bPre_IgnoreNewPara is cleared on every call.
// nToken: the token to filter. Returns the filtered token id.
1640 | | HtmlTokenId HTMLParser::FilterPRE( HtmlTokenId nToken ) |
1641 | 1.00M | { |
1642 | 1.00M | switch( nToken ) |
1643 | 1.00M | { |
1644 | | // in Netscape they only have impact in not empty paragraphs |
1645 | 7.34k | case HtmlTokenId::PARABREAK_ON: |
1646 | 7.34k | nToken = HtmlTokenId::LINEBREAK; |
1647 | 7.34k | [[fallthrough]]; |
1648 | 13.3k | case HtmlTokenId::LINEBREAK: |
1649 | 154k | case HtmlTokenId::NEWPARA: |
1650 | 154k | nPre_LinePos = 0; |
1651 | 154k | if( bPre_IgnoreNewPara ) |
1652 | 2.50k | nToken = HtmlTokenId::NONE; |
1653 | 154k | break; |
1654 | | |
1655 | 162k | case HtmlTokenId::TABCHAR: |
1656 | 162k | { |
// Number of columns up to the next tab stop (tab stops every 8 columns).
// NOTE(review): padToLength presumably fills aToken with blanks up to that
// length - its implementation is not visible here.
1657 | 162k | sal_Int32 nSpaces = 8 - (nPre_LinePos % 8); |
1658 | 162k | DBG_ASSERT( aToken.isEmpty(), "Why is the token not empty?" ); |
1659 | 162k | if (aToken.getLength() < nSpaces) |
1660 | 162k | { |
1661 | 162k | comphelper::string::padToLength(aToken, nSpaces, ' '); |
1662 | 162k | } |
1663 | 162k | nPre_LinePos += nSpaces; |
1664 | 162k | nToken = HtmlTokenId::TEXTTOKEN; |
1665 | 162k | } |
1666 | 162k | break; |
1667 | | // Keep those |
1668 | 339k | case HtmlTokenId::TEXTTOKEN: |
1669 | 339k | nPre_LinePos += aToken.getLength(); |
1670 | 339k | break; |
1671 | | |
// Pass-through list: these tokens are returned unchanged.
1672 | 80 | case HtmlTokenId::SELECT_ON: |
1673 | 401 | case HtmlTokenId::SELECT_OFF: |
1674 | 401 | case HtmlTokenId::BODY_ON: |
1675 | 850 | case HtmlTokenId::FORM_ON: |
1676 | 3.24k | case HtmlTokenId::FORM_OFF: |
1677 | 3.86k | case HtmlTokenId::INPUT: |
1678 | 3.95k | case HtmlTokenId::OPTION: |
1679 | 4.20k | case HtmlTokenId::TEXTAREA_ON: |
1680 | 4.93k | case HtmlTokenId::TEXTAREA_OFF: |
1681 | | |
1682 | 11.1k | case HtmlTokenId::IMAGE: |
1683 | 11.3k | case HtmlTokenId::APPLET_ON: |
1684 | 11.6k | case HtmlTokenId::APPLET_OFF: |
1685 | 11.7k | case HtmlTokenId::PARAM: |
1686 | 12.3k | case HtmlTokenId::EMBED: |
1687 | | |
1688 | 14.9k | case HtmlTokenId::HEAD1_ON: |
1689 | 16.7k | case HtmlTokenId::HEAD1_OFF: |
1690 | 21.4k | case HtmlTokenId::HEAD2_ON: |
1691 | 23.4k | case HtmlTokenId::HEAD2_OFF: |
1692 | 24.8k | case HtmlTokenId::HEAD3_ON: |
1693 | 25.2k | case HtmlTokenId::HEAD3_OFF: |
1694 | 27.2k | case HtmlTokenId::HEAD4_ON: |
1695 | 27.9k | case HtmlTokenId::HEAD4_OFF: |
1696 | 30.0k | case HtmlTokenId::HEAD5_ON: |
1697 | 30.2k | case HtmlTokenId::HEAD5_OFF: |
1698 | 31.3k | case HtmlTokenId::HEAD6_ON: |
1699 | 31.5k | case HtmlTokenId::HEAD6_OFF: |
1700 | 32.1k | case HtmlTokenId::BLOCKQUOTE_ON: |
1701 | 32.4k | case HtmlTokenId::BLOCKQUOTE_OFF: |
1702 | 32.6k | case HtmlTokenId::ADDRESS_ON: |
1703 | 32.7k | case HtmlTokenId::ADDRESS_OFF: |
1704 | 36.0k | case HtmlTokenId::HORZRULE: |
1705 | | |
1706 | 39.4k | case HtmlTokenId::CENTER_ON: |
1707 | 40.2k | case HtmlTokenId::CENTER_OFF: |
1708 | 49.4k | case HtmlTokenId::DIVISION_ON: |
1709 | 52.3k | case HtmlTokenId::DIVISION_OFF: |
1710 | | |
1711 | 52.6k | case HtmlTokenId::SCRIPT_ON: |
1712 | 52.8k | case HtmlTokenId::SCRIPT_OFF: |
1713 | 63.6k | case HtmlTokenId::RAWDATA: |
1714 | | |
1715 | 71.8k | case HtmlTokenId::TABLE_ON: |
1716 | 76.2k | case HtmlTokenId::TABLE_OFF: |
1717 | 77.1k | case HtmlTokenId::CAPTION_ON: |
1718 | 77.4k | case HtmlTokenId::CAPTION_OFF: |
1719 | 79.1k | case HtmlTokenId::COLGROUP_ON: |
1720 | 80.1k | case HtmlTokenId::COLGROUP_OFF: |
1721 | 84.1k | case HtmlTokenId::COL_ON: |
1722 | 84.3k | case HtmlTokenId::COL_OFF: |
1723 | 84.4k | case HtmlTokenId::THEAD_ON: |
1724 | 84.7k | case HtmlTokenId::THEAD_OFF: |
1725 | 85.3k | case HtmlTokenId::TFOOT_ON: |
1726 | 85.6k | case HtmlTokenId::TFOOT_OFF: |
1727 | 86.4k | case HtmlTokenId::TBODY_ON: |
1728 | 86.9k | case HtmlTokenId::TBODY_OFF: |
1729 | 101k | case HtmlTokenId::TABLEROW_ON: |
1730 | 104k | case HtmlTokenId::TABLEROW_OFF: |
1731 | 171k | case HtmlTokenId::TABLEDATA_ON: |
1732 | 194k | case HtmlTokenId::TABLEDATA_OFF: |
1733 | 200k | case HtmlTokenId::TABLEHEADER_ON: |
1734 | 201k | case HtmlTokenId::TABLEHEADER_OFF: |
1735 | | |
1736 | 207k | case HtmlTokenId::ANCHOR_ON: |
1737 | 212k | case HtmlTokenId::ANCHOR_OFF: |
1738 | 215k | case HtmlTokenId::BOLD_ON: |
1739 | 217k | case HtmlTokenId::BOLD_OFF: |
1740 | 220k | case HtmlTokenId::ITALIC_ON: |
1741 | 222k | case HtmlTokenId::ITALIC_OFF: |
1742 | 222k | case HtmlTokenId::STRIKE_ON: |
1743 | 223k | case HtmlTokenId::STRIKE_OFF: |
1744 | 223k | case HtmlTokenId::STRIKETHROUGH_ON: |
1745 | 224k | case HtmlTokenId::STRIKETHROUGH_OFF: |
1746 | 227k | case HtmlTokenId::UNDERLINE_ON: |
1747 | 228k | case HtmlTokenId::UNDERLINE_OFF: |
1748 | 228k | case HtmlTokenId::BASEFONT_ON: |
1749 | 228k | case HtmlTokenId::BASEFONT_OFF: |
1750 | 230k | case HtmlTokenId::FONT_ON: |
1751 | 231k | case HtmlTokenId::FONT_OFF: |
1752 | 232k | case HtmlTokenId::BLINK_ON: |
1753 | 232k | case HtmlTokenId::BLINK_OFF: |
1754 | 235k | case HtmlTokenId::SPAN_ON: |
1755 | 237k | case HtmlTokenId::SPAN_OFF: |
1756 | 237k | case HtmlTokenId::SUBSCRIPT_ON: |
1757 | 238k | case HtmlTokenId::SUBSCRIPT_OFF: |
1758 | 238k | case HtmlTokenId::SUPERSCRIPT_ON: |
1759 | 238k | case HtmlTokenId::SUPERSCRIPT_OFF: |
1760 | 239k | case HtmlTokenId::BIGPRINT_ON: |
1761 | 240k | case HtmlTokenId::BIGPRINT_OFF: |
1762 | 240k | case HtmlTokenId::SMALLPRINT_OFF: |
1763 | 241k | case HtmlTokenId::SMALLPRINT_ON: |
1764 | | |
1765 | 249k | case HtmlTokenId::EMPHASIS_ON: |
1766 | 250k | case HtmlTokenId::EMPHASIS_OFF: |
1767 | 250k | case HtmlTokenId::CITATION_ON: |
1768 | 250k | case HtmlTokenId::CITATION_OFF: |
1769 | 251k | case HtmlTokenId::STRONG_ON: |
1770 | 251k | case HtmlTokenId::STRONG_OFF: |
1771 | 253k | case HtmlTokenId::CODE_ON: |
1772 | 253k | case HtmlTokenId::CODE_OFF: |
1773 | 254k | case HtmlTokenId::SAMPLE_ON: |
1774 | 254k | case HtmlTokenId::SAMPLE_OFF: |
1775 | 255k | case HtmlTokenId::KEYBOARD_ON: |
1776 | 255k | case HtmlTokenId::KEYBOARD_OFF: |
1777 | 256k | case HtmlTokenId::VARIABLE_ON: |
1778 | 257k | case HtmlTokenId::VARIABLE_OFF: |
1779 | 257k | case HtmlTokenId::DEFINSTANCE_ON: |
1780 | 257k | case HtmlTokenId::DEFINSTANCE_OFF: |
1781 | 258k | case HtmlTokenId::SHORTQUOTE_ON: |
1782 | 258k | case HtmlTokenId::SHORTQUOTE_OFF: |
1783 | 258k | case HtmlTokenId::LANGUAGE_ON: |
1784 | 258k | case HtmlTokenId::LANGUAGE_OFF: |
1785 | 259k | case HtmlTokenId::AUTHOR_ON: |
1786 | 259k | case HtmlTokenId::AUTHOR_OFF: |
1787 | 259k | case HtmlTokenId::PERSON_ON: |
1788 | 259k | case HtmlTokenId::PERSON_OFF: |
1789 | 260k | case HtmlTokenId::ACRONYM_ON: |
1790 | 260k | case HtmlTokenId::ACRONYM_OFF: |
1791 | 260k | case HtmlTokenId::ABBREVIATION_ON: |
1792 | 260k | case HtmlTokenId::ABBREVIATION_OFF: |
1793 | 261k | case HtmlTokenId::INSERTEDTEXT_ON: |
1794 | 261k | case HtmlTokenId::INSERTEDTEXT_OFF: |
1795 | 262k | case HtmlTokenId::DELETEDTEXT_ON: |
1796 | 262k | case HtmlTokenId::DELETEDTEXT_OFF: |
1797 | 264k | case HtmlTokenId::TELETYPE_ON: |
1798 | 265k | case HtmlTokenId::TELETYPE_OFF: |
1799 | | |
1800 | 265k | break; |
1801 | | |
1802 | | // The remainder is treated as an unknown token. |
1803 | 83.0k | default: |
// NONE stays NONE. Everything else is reduced to a generic unknown ON or OFF
// control, depending on whether it lies in the on/off range and is an OFF token.
1804 | 83.0k | if( nToken != HtmlTokenId::NONE ) |
1805 | 83.0k | { |
1806 | 83.0k | nToken = |
1807 | 83.0k | ( ((nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken)) |
1808 | 83.0k | ? HtmlTokenId::UNKNOWNCONTROL_OFF |
1809 | 83.0k | : HtmlTokenId::UNKNOWNCONTROL_ON ); |
1810 | 83.0k | } |
1811 | 83.0k | break; |
1812 | 1.00M | } |
1813 | | |
// The ignore flag only affects the single token handled by this call.
1814 | 1.00M | bPre_IgnoreNewPara = false; |
1815 | | |
1816 | 1.00M | return nToken; |
1817 | 1.00M | } |
1818 | | |
1819 | | HtmlTokenId HTMLParser::FilterXMP( HtmlTokenId nToken ) |
1820 | 130k | { |
1821 | 130k | switch( nToken ) |
1822 | 130k | { |
1823 | 41.3k | case HtmlTokenId::NEWPARA: |
1824 | 41.3k | if( bPre_IgnoreNewPara ) |
1825 | 599 | nToken = HtmlTokenId::NONE; |
1826 | 41.3k | [[fallthrough]]; |
1827 | 91.3k | case HtmlTokenId::TEXTTOKEN: |
1828 | 91.3k | case HtmlTokenId::NONBREAKSPACE: |
1829 | 91.3k | case HtmlTokenId::SOFTHYPH: |
1830 | 91.3k | break; // kept |
1831 | | |
1832 | 38.9k | default: |
1833 | 38.9k | if( nToken != HtmlTokenId::NONE ) |
1834 | 38.9k | { |
1835 | 38.9k | if( (nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken) ) |
1836 | 10.9k | { |
1837 | 10.9k | sSaveToken = "</" + sSaveToken; |
1838 | 10.9k | } |
1839 | 27.9k | else |
1840 | 27.9k | sSaveToken = "<" + sSaveToken; |
1841 | 38.9k | if( !aToken.isEmpty() ) |
1842 | 13.1k | { |
1843 | 13.1k | UnescapeToken(); |
1844 | 13.1k | sSaveToken += " "; |
1845 | 13.1k | aToken.insert(0, sSaveToken); |
1846 | 13.1k | } |
1847 | 25.8k | else |
1848 | 25.8k | aToken = sSaveToken; |
1849 | 38.9k | aToken.append( ">" ); |
1850 | 38.9k | nToken = HtmlTokenId::TEXTTOKEN; |
1851 | 38.9k | } |
1852 | 38.9k | break; |
1853 | 130k | } |
1854 | | |
1855 | 130k | bPre_IgnoreNewPara = false; |
1856 | | |
1857 | 130k | return nToken; |
1858 | 130k | } |
1859 | | |
1860 | | HtmlTokenId HTMLParser::FilterListing( HtmlTokenId nToken ) |
1861 | 20.3k | { |
1862 | 20.3k | switch( nToken ) |
1863 | 20.3k | { |
1864 | 6.13k | case HtmlTokenId::NEWPARA: |
1865 | 6.13k | if( bPre_IgnoreNewPara ) |
1866 | 243 | nToken = HtmlTokenId::NONE; |
1867 | 6.13k | [[fallthrough]]; |
1868 | 13.5k | case HtmlTokenId::TEXTTOKEN: |
1869 | 14.0k | case HtmlTokenId::NONBREAKSPACE: |
1870 | 14.4k | case HtmlTokenId::SOFTHYPH: |
1871 | 14.4k | break; // kept |
1872 | | |
1873 | 5.89k | default: |
1874 | 5.89k | if( nToken != HtmlTokenId::NONE ) |
1875 | 5.89k | { |
1876 | 5.89k | nToken = |
1877 | 5.89k | ( ((nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken)) |
1878 | 5.89k | ? HtmlTokenId::UNKNOWNCONTROL_OFF |
1879 | 5.89k | : HtmlTokenId::UNKNOWNCONTROL_ON ); |
1880 | 5.89k | } |
1881 | 5.89k | break; |
1882 | 20.3k | } |
1883 | | |
1884 | 20.3k | bPre_IgnoreNewPara = false; |
1885 | | |
1886 | 20.3k | return nToken; |
1887 | 20.3k | } |
1888 | | |
1889 | | bool HTMLParser::InternalImgToPrivateURL( OUString& rURL ) |
1890 | 51.8k | { |
1891 | 51.8k | bool bFound = false; |
1892 | | |
1893 | 51.8k | if( rURL.startsWith( OOO_STRING_SVTOOLS_HTML_internal_icon ) ) |
1894 | 0 | { |
1895 | 0 | OUString aName( rURL.copy(14) ); |
1896 | 0 | switch( aName[0] ) |
1897 | 0 | { |
1898 | 0 | case 'b': |
1899 | 0 | bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata; |
1900 | 0 | break; |
1901 | 0 | case 'd': |
1902 | 0 | bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed; |
1903 | 0 | break; |
1904 | 0 | case 'e': |
1905 | 0 | bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_embed; |
1906 | 0 | break; |
1907 | 0 | case 'i': |
1908 | 0 | bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure; |
1909 | 0 | break; |
1910 | 0 | case 'n': |
1911 | 0 | bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound; |
1912 | 0 | break; |
1913 | 0 | } |
1914 | 0 | } |
1915 | 51.8k | if( bFound ) |
1916 | 0 | { |
1917 | 0 | OUString sTmp ( rURL ); |
1918 | 0 | rURL = OOO_STRING_SVTOOLS_HTML_private_image; |
1919 | 0 | rURL += sTmp; |
1920 | 0 | } |
1921 | | |
1922 | 51.8k | return bFound; |
1923 | 51.8k | } |
1924 | | |
namespace {

// Action selected for a <META> element, looked up from its NAME or
// HTTP-EQUIV value. NONE (0) means no known name matched.
enum class HtmlMeta
{
    NONE = 0,        // no known meta name recognised
    Author,          // stored via setAuthor()
    Description,     // stored via setDescription()
    Keywords,        // stored via setKeywords()
    Refresh,         // only checked by a debug assertion
    Classification,  // stored via setSubject()
    Created,         // stored via setCreationDate()
    ChangedBy,       // stored via setModifiedBy()
    Changed,         // stored via setModificationDate()
    Generator,       // not evaluated in this file's meta handling
    SDFootnote,      // not evaluated in this file's meta handling
    SDEndnote,       // not evaluated in this file's meta handling
    ContentType      // used to derive the text encoding
};

}
1944 | | |
1945 | | // <META NAME=xxx> |
1946 | | HTMLOptionEnum<HtmlMeta> const aHTMLMetaNameTable[] = |
1947 | | { |
1948 | | { OOO_STRING_SVTOOLS_HTML_META_author, HtmlMeta::Author }, |
1949 | | { OOO_STRING_SVTOOLS_HTML_META_changed, HtmlMeta::Changed }, |
1950 | | { OOO_STRING_SVTOOLS_HTML_META_changedby, HtmlMeta::ChangedBy }, |
1951 | | { OOO_STRING_SVTOOLS_HTML_META_classification,HtmlMeta::Classification}, |
1952 | | { OOO_STRING_SVTOOLS_HTML_META_content_type, HtmlMeta::ContentType }, |
1953 | | { OOO_STRING_SVTOOLS_HTML_META_created, HtmlMeta::Created }, |
1954 | | { OOO_STRING_SVTOOLS_HTML_META_description, HtmlMeta::Description }, |
1955 | | { OOO_STRING_SVTOOLS_HTML_META_keywords, HtmlMeta::Keywords }, |
1956 | | { OOO_STRING_SVTOOLS_HTML_META_generator, HtmlMeta::Generator }, |
1957 | | { OOO_STRING_SVTOOLS_HTML_META_refresh, HtmlMeta::Refresh }, |
1958 | | { OOO_STRING_SVTOOLS_HTML_META_sdendnote, HtmlMeta::SDEndnote }, |
1959 | | { OOO_STRING_SVTOOLS_HTML_META_sdfootnote, HtmlMeta::SDFootnote }, |
1960 | | { nullptr, HtmlMeta(0) } |
1961 | | }; |
1962 | | |
1963 | | |
1964 | | void HTMLParser::AddMetaUserDefined( OUString const & ) |
1965 | 0 | { |
1966 | 0 | } |
1967 | | |
1968 | | bool HTMLParser::ParseMetaOptionsImpl( |
1969 | | const uno::Reference<document::XDocumentProperties> & i_xDocProps, |
1970 | | SvKeyValueIterator *i_pHTTPHeader, |
1971 | | const HTMLOptions& aOptions, |
1972 | | rtl_TextEncoding& o_rEnc ) |
1973 | 63.1k | { |
1974 | 63.1k | OUString aName, aContent; |
1975 | 63.1k | HtmlMeta nAction = HtmlMeta::NONE; |
1976 | 63.1k | bool bHTTPEquiv = false, bChanged = false; |
1977 | | |
1978 | 171k | for ( size_t i = aOptions.size(); i; ) |
1979 | 108k | { |
1980 | 108k | const HTMLOption& aOption = aOptions[--i]; |
1981 | 108k | switch ( aOption.GetToken() ) |
1982 | 108k | { |
1983 | 37.5k | case HtmlOptionId::NAME: |
1984 | 37.5k | aName = aOption.GetString(); |
1985 | 37.5k | if ( HtmlMeta::NONE==nAction ) |
1986 | 37.3k | { |
1987 | 37.3k | aOption.GetEnum( nAction, aHTMLMetaNameTable ); |
1988 | 37.3k | } |
1989 | 37.5k | break; |
1990 | 5.92k | case HtmlOptionId::HTTPEQUIV: |
1991 | 5.92k | aName = aOption.GetString(); |
1992 | 5.92k | aOption.GetEnum( nAction, aHTMLMetaNameTable ); |
1993 | 5.92k | bHTTPEquiv = true; |
1994 | 5.92k | break; |
1995 | 35.2k | case HtmlOptionId::CONTENT: |
1996 | 35.2k | aContent = aOption.GetString(); |
1997 | 35.2k | break; |
1998 | 1.05k | case HtmlOptionId::CHARSET: |
1999 | 1.05k | { |
2000 | 1.05k | OString sValue(OUStringToOString(aOption.GetString(), RTL_TEXTENCODING_ASCII_US)); |
2001 | 1.05k | o_rEnc = GetExtendedCompatibilityTextEncoding(rtl_getTextEncodingFromMimeCharset(sValue.getStr())); |
2002 | 1.05k | break; |
2003 | 0 | } |
2004 | 28.5k | default: break; |
2005 | 108k | } |
2006 | 108k | } |
2007 | | |
2008 | 63.1k | if ( bHTTPEquiv || HtmlMeta::Description != nAction ) |
2009 | 63.0k | { |
2010 | | // if it is not a Description, remove CRs and LFs from CONTENT |
2011 | 63.0k | aContent = aContent.replaceAll("\r", "").replaceAll("\n", ""); |
2012 | 63.0k | } |
2013 | 178 | else |
2014 | 178 | { |
2015 | | // convert line endings for Description |
2016 | 178 | aContent = convertLineEnd(aContent, GetSystemLineEnd()); |
2017 | 178 | } |
2018 | | |
2019 | 63.1k | if ( bHTTPEquiv && i_pHTTPHeader ) |
2020 | 5.86k | { |
2021 | | // Netscape seems to just ignore a closing ", so we do too |
2022 | 5.86k | if ( aContent.endsWith("\"") ) |
2023 | 70 | { |
2024 | 70 | aContent = aContent.copy( 0, aContent.getLength() - 1 ); |
2025 | 70 | } |
2026 | 5.86k | SvKeyValue aKeyValue( aName, aContent ); |
2027 | 5.86k | i_pHTTPHeader->Append( aKeyValue ); |
2028 | 5.86k | } |
2029 | | |
2030 | 63.1k | switch ( nAction ) |
2031 | 63.1k | { |
2032 | 1.47k | case HtmlMeta::Author: |
2033 | 1.47k | if (i_xDocProps.is()) { |
2034 | 1.47k | i_xDocProps->setAuthor( aContent ); |
2035 | 1.47k | bChanged = true; |
2036 | 1.47k | } |
2037 | 1.47k | break; |
2038 | 178 | case HtmlMeta::Description: |
2039 | 178 | if (i_xDocProps.is()) { |
2040 | 178 | i_xDocProps->setDescription( aContent ); |
2041 | 178 | bChanged = true; |
2042 | 178 | } |
2043 | 178 | break; |
2044 | 3.23k | case HtmlMeta::Keywords: |
2045 | 3.23k | if (i_xDocProps.is()) { |
2046 | 3.23k | i_xDocProps->setKeywords( |
2047 | 3.23k | ::comphelper::string::convertCommaSeparated(aContent)); |
2048 | 3.23k | bChanged = true; |
2049 | 3.23k | } |
2050 | 3.23k | break; |
2051 | 14 | case HtmlMeta::Classification: |
2052 | 14 | if (i_xDocProps.is()) { |
2053 | 14 | i_xDocProps->setSubject( aContent ); |
2054 | 14 | bChanged = true; |
2055 | 14 | } |
2056 | 14 | break; |
2057 | | |
2058 | 1.89k | case HtmlMeta::ChangedBy: |
2059 | 1.89k | if (i_xDocProps.is()) { |
2060 | 1.89k | i_xDocProps->setModifiedBy( aContent ); |
2061 | 1.89k | bChanged = true; |
2062 | 1.89k | } |
2063 | 1.89k | break; |
2064 | | |
2065 | 4.06k | case HtmlMeta::Created: |
2066 | 15.0k | case HtmlMeta::Changed: |
2067 | 15.0k | if (i_xDocProps.is() && !aContent.isEmpty()) |
2068 | 14.1k | { |
2069 | 14.1k | ::util::DateTime uDT; |
2070 | 14.1k | bool valid = false; |
2071 | 14.1k | if (comphelper::string::getTokenCount(aContent, ';') == 2) |
2072 | 4.76k | { |
2073 | 4.76k | sal_Int32 nIdx{ 0 }; |
2074 | 4.76k | sal_Int32 nDate = o3tl::toInt32(o3tl::getToken(aContent, 0, ';', nIdx)); |
2075 | 4.76k | sal_Int64 nTime = o3tl::toInt64(o3tl::getToken(aContent, 0, ';', nIdx)); |
2076 | 4.76k | valid = nDate != std::numeric_limits<sal_Int32>::min() && |
2077 | 4.76k | nTime != std::numeric_limits<sal_Int64>::min(); |
2078 | 4.76k | if (valid) |
2079 | 4.76k | { |
2080 | 4.76k | Date aDate(nDate); |
2081 | 4.76k | tools::Time aTime(tools::Time::fromEncodedTime(nTime)); |
2082 | 4.76k | uDT = DateTime(aDate, aTime).GetUNODateTime(); |
2083 | 4.76k | } |
2084 | 4.76k | } |
2085 | 9.41k | else if (utl::ISO8601parseDateTime(aContent, uDT)) |
2086 | 4.65k | valid = true; |
2087 | | |
2088 | 14.1k | if (valid) |
2089 | 9.41k | { |
2090 | 9.41k | bChanged = true; |
2091 | 9.41k | if (HtmlMeta::Created == nAction) |
2092 | 1.37k | i_xDocProps->setCreationDate(uDT); |
2093 | 8.03k | else |
2094 | 8.03k | i_xDocProps->setModificationDate(uDT); |
2095 | 9.41k | } |
2096 | 14.1k | } |
2097 | 15.0k | break; |
2098 | | |
2099 | 2 | case HtmlMeta::Refresh: |
2100 | 2 | DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader, "Lost Reload-URL because of omitted MUST change." ); |
2101 | 2 | break; |
2102 | | |
2103 | 4.67k | case HtmlMeta::ContentType: |
2104 | 4.67k | if ( !aContent.isEmpty() ) |
2105 | 4.32k | { |
2106 | 4.32k | o_rEnc = GetEncodingByMIME( aContent ); |
2107 | 4.32k | } |
2108 | 4.67k | break; |
2109 | | |
2110 | 32.5k | case HtmlMeta::NONE: |
2111 | 32.5k | if ( !bHTTPEquiv ) |
2112 | 31.3k | { |
2113 | 31.3k | if (i_xDocProps.is()) |
2114 | 31.3k | { |
2115 | 31.3k | uno::Reference<beans::XPropertyContainer> xUDProps |
2116 | 31.3k | = i_xDocProps->getUserDefinedProperties(); |
2117 | 31.3k | try { |
2118 | 31.3k | xUDProps->addProperty(aName, |
2119 | 31.3k | beans::PropertyAttribute::REMOVABLE, |
2120 | 31.3k | uno::Any(aContent)); |
2121 | 31.3k | AddMetaUserDefined(aName); |
2122 | 31.3k | bChanged = true; |
2123 | 31.3k | } catch (uno::Exception &) { |
2124 | | // ignore |
2125 | 23.4k | } |
2126 | 31.3k | } |
2127 | 31.3k | } |
2128 | 32.5k | break; |
2129 | 32.5k | default: |
2130 | 4.12k | break; |
2131 | 63.1k | } |
2132 | | |
2133 | 63.1k | return bChanged; |
2134 | 63.1k | } |
2135 | | |
2136 | | bool HTMLParser::ParseMetaOptions( |
2137 | | const uno::Reference<document::XDocumentProperties> & i_xDocProps, |
2138 | | SvKeyValueIterator *i_pHeader ) |
2139 | 63.1k | { |
2140 | 63.1k | HtmlOptionId nContentOption = HtmlOptionId::CONTENT; |
2141 | 63.1k | rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW; |
2142 | | |
2143 | 63.1k | bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader, |
2144 | 63.1k | GetOptions(&nContentOption), |
2145 | 63.1k | eEnc ); |
2146 | | |
2147 | | // If the encoding is set by a META tag, it may only overwrite the |
2148 | | // current encoding if both, the current and the new encoding, are 1-sal_uInt8 |
2149 | | // encodings. Everything else cannot lead to reasonable results. |
2150 | 63.1k | if (RTL_TEXTENCODING_DONTKNOW != eEnc && |
2151 | 3.66k | rtl_isOctetTextEncoding( eEnc ) && |
2152 | 3.66k | rtl_isOctetTextEncoding( GetSrcEncoding() ) ) |
2153 | 3.19k | { |
2154 | 3.19k | eEnc = GetExtendedCompatibilityTextEncoding( eEnc ); |
2155 | 3.19k | SetSrcEncoding( eEnc ); |
2156 | 3.19k | } |
2157 | | |
2158 | 63.1k | return bRet; |
2159 | 63.1k | } |
2160 | | |
2161 | | rtl_TextEncoding HTMLParser::GetEncodingByMIME( const OUString& rMime ) |
2162 | 42.3k | { |
2163 | 42.3k | OUString sType; |
2164 | 42.3k | OUString sSubType; |
2165 | 42.3k | INetContentTypeParameterList aParameters; |
2166 | 42.3k | if (INetContentTypes::parse(rMime, sType, sSubType, &aParameters)) |
2167 | 34.9k | { |
2168 | 34.9k | auto const iter = aParameters.find("charset"_ostr); |
2169 | 34.9k | if (iter != aParameters.end()) |
2170 | 33.8k | { |
2171 | 33.8k | const INetContentTypeParameter * pCharset = &iter->second; |
2172 | 33.8k | OString sValue(OUStringToOString(pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US)); |
2173 | 33.8k | return GetExtendedCompatibilityTextEncoding( rtl_getTextEncodingFromMimeCharset( sValue.getStr() ) ); |
2174 | 33.8k | } |
2175 | 34.9k | } |
2176 | 8.51k | return RTL_TEXTENCODING_DONTKNOW; |
2177 | 42.3k | } |
2178 | | |
2179 | | rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader ) |
2180 | 42.7k | { |
2181 | 42.7k | rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW; |
2182 | 42.7k | if( pHTTPHeader ) |
2183 | 42.7k | { |
2184 | 42.7k | SvKeyValue aKV; |
2185 | 71.1k | for( bool bCont = pHTTPHeader->GetFirst( aKV ); bCont; |
2186 | 42.7k | bCont = pHTTPHeader->GetNext( aKV ) ) |
2187 | 28.3k | { |
2188 | 28.3k | if( aKV.GetKey().equalsIgnoreAsciiCase( OOO_STRING_SVTOOLS_HTML_META_content_type ) ) |
2189 | 28.3k | { |
2190 | 28.3k | if( !aKV.GetValue().isEmpty() ) |
2191 | 28.3k | { |
2192 | 28.3k | eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() ); |
2193 | 28.3k | } |
2194 | 28.3k | } |
2195 | 28.3k | } |
2196 | 42.7k | } |
2197 | 42.7k | return eRet; |
2198 | 42.7k | } |
2199 | | |
2200 | | bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader ) |
2201 | 42.7k | { |
2202 | 42.7k | bool bRet = false; |
2203 | 42.7k | rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader ); |
2204 | 42.7k | if(RTL_TEXTENCODING_DONTKNOW != eEnc) |
2205 | 28.3k | { |
2206 | 28.3k | SetSrcEncoding( eEnc ); |
2207 | 28.3k | bRet = true; |
2208 | 28.3k | } |
2209 | 42.7k | return bRet; |
2210 | 42.7k | } |
2211 | | |
2212 | | |
2213 | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |