/src/libreoffice/sax/source/fastparser/fastparser.cxx
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /* |
3 | | * This file is part of the LibreOffice project. |
4 | | * |
5 | | * This Source Code Form is subject to the terms of the Mozilla Public |
6 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
7 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8 | | * |
9 | | * This file incorporates work covered by the following license notice: |
10 | | * |
11 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
12 | | * contributor license agreements. See the NOTICE file distributed |
13 | | * with this work for additional information regarding copyright |
14 | | * ownership. The ASF licenses this file to you under the Apache |
15 | | * License, Version 2.0 (the "License"); you may not use this file |
16 | | * except in compliance with the License. You may obtain a copy of |
17 | | * the License at http://www.apache.org/licenses/LICENSE-2.0 . |
18 | | */ |
19 | | |
20 | | #include <sax/fastparser.hxx> |
21 | | #include <sax/fastattribs.hxx> |
22 | | #include <utility> |
23 | | #include <xml2utf.hxx> |
24 | | |
25 | | #include <com/sun/star/io/XSeekable.hpp> |
26 | | #include <com/sun/star/lang/DisposedException.hpp> |
27 | | #include <com/sun/star/lang/IllegalArgumentException.hpp> |
28 | | #include <com/sun/star/uno/XComponentContext.hpp> |
29 | | #include <com/sun/star/xml/sax/FastToken.hpp> |
30 | | #include <com/sun/star/xml/sax/SAXParseException.hpp> |
31 | | #include <com/sun/star/xml/sax/XFastContextHandler.hpp> |
32 | | #include <cppuhelper/implbase.hxx> |
33 | | #include <cppuhelper/supportsservice.hxx> |
34 | | #include <cppuhelper/exc_hlp.hxx> |
35 | | #include <osl/conditn.hxx> |
36 | | #include <rtl/ref.hxx> |
37 | | #include <sal/log.hxx> |
38 | | #include <salhelper/thread.hxx> |
39 | | #include <comphelper/diagnose_ex.hxx> |
40 | | #include <o3tl/string_view.hxx> |
41 | | |
42 | | #include <queue> |
43 | | #include <memory> |
44 | | #include <mutex> |
45 | | #include <optional> |
46 | | #include <stack> |
47 | | #include <string_view> |
48 | | #include <unordered_map> |
49 | | #include <vector> |
50 | | #include <cassert> |
51 | | #include <cstring> |
52 | | #include <libxml/parser.h> |
53 | | |
// Inverse of libxml's BAD_CAST: reinterprets its argument as a plain
// `const char*` (no copy is made, only the pointer type changes).
#define XML_CAST( str ) reinterpret_cast< const char* >( str )
56 | | |
57 | | using namespace ::osl; |
58 | | using namespace ::cppu; |
59 | | using namespace ::com::sun::star::uno; |
60 | | using namespace ::com::sun::star::lang; |
61 | | using namespace ::com::sun::star::xml::sax; |
62 | | using namespace ::com::sun::star::io; |
63 | | using namespace com::sun::star; |
64 | | using namespace sax_fastparser; |
65 | | |
66 | | static void NormalizeURI( OUString& rName ); |
67 | | |
68 | | namespace { |
69 | | |
70 | | struct Event; |
71 | | class FastLocatorImpl; |
72 | | struct NamespaceDefine; |
73 | | struct Entity; |
74 | | |
75 | | typedef std::unordered_map< OUString, sal_Int32 > NamespaceMap; |
76 | | |
/// A batch of parser events.
struct EventList
{
    /// Event slots. Entity::getEventList() sizes a fresh list to
    /// Entity::mnEventListSize; Entity::getEvent() may grow it by one slot.
    std::vector<Event> maEvents;
    /// Set to false by Entity::getEventList() when a fresh list is created.
    bool mbIsAttributesEmpty;
};
82 | | |
/// Kind of callback recorded in an Event. EXCEPTION is queued by
/// ParserThread::execute() when FastSaxParserImpl::parse() throws.
enum class CallbackType { START_ELEMENT, END_ELEMENT, CHARACTERS, PROCESSING_INSTRUCTION, DONE, EXCEPTION };
84 | | |
/// One parser event together with the data its callback needs.
struct Event
{
    /// Written by Entity::getEvent(), but only when threading is enabled.
    CallbackType maType;
    /// Element token; FastToken::DONTKNOW selects the "unknown element" code path.
    sal_Int32 mnElementToken;
    /// Namespace and local name, handed to the *Unknown* handler callbacks.
    OUString msNamespace;
    OUString msElementName;
    /// Attributes passed to the context handlers in Entity::startElement().
    rtl::Reference< FastAttributeList > mxAttributes;
    /// Its unknown attributes are forwarded to the namespace handler's
    /// registerNamespace() in Entity::startElement().
    rtl::Reference< FastAttributeList > mxDeclAttributes;
    /// NOTE(review): presumably the text payload for CHARACTERS events - confirm against the producer.
    OUString msChars;
};
95 | | |
/// Pairs a name with its token; element type of Entity::maNamespaceStack.
struct NameWithToken
{
    OUString msName;
    sal_Int32 mnToken;

    /// Takes sName by value and moves it into msName.
    NameWithToken(OUString sName, sal_Int32 nToken) :
        msName(std::move(sName)), mnToken(nToken) {}
};
104 | | |
105 | | struct SaxContext |
106 | | { |
107 | | Reference< XFastContextHandler > mxContext; |
108 | | sal_Int32 mnElementToken; |
109 | | std::optional<OUString> moNamespace; |
110 | | std::optional<OUString> moElementName; |
111 | | |
112 | | SaxContext( sal_Int32 nElementToken, const OUString& aNamespace, const OUString& aElementName ): |
113 | 37.5M | mnElementToken(nElementToken) |
114 | 37.5M | { |
115 | 37.5M | if (nElementToken == FastToken::DONTKNOW) |
116 | 19.4M | { |
117 | 19.4M | moNamespace = aNamespace; |
118 | 19.4M | moElementName = aElementName; |
119 | 19.4M | } |
120 | 37.5M | } |
121 | | }; |
122 | | |
/// Handler configuration of a parser. Entity derives from it and is
/// constructed from a ParserData, so each parse gets its own copy.
struct ParserData
{
    css::uno::Reference< css::xml::sax::XFastDocumentHandler > mxDocumentHandler;
    rtl::Reference<FastTokenHandlerBase> mxTokenHandler;
    css::uno::Reference< css::xml::sax::XErrorHandler > mxErrorHandler;
    css::uno::Reference< css::xml::sax::XFastNamespaceHandler >mxNamespaceHandler;

    ParserData();
};
132 | | |
/// One prefix -> namespace binding; element type of Entity::maNamespaceDefines.
struct NamespaceDefine
{
    OString maPrefix;
    /// Namespace token as returned by FastSaxParserImpl::GetNamespaceToken().
    sal_Int32 mnToken;
    OUString maNamespaceURL;

    NamespaceDefine( OString aPrefix, sal_Int32 nToken, OUString aNamespaceURL )
        : maPrefix(std::move( aPrefix )), mnToken( nToken ), maNamespaceURL(std::move( aNamespaceURL )) {}
    /// Default state: empty prefix/URL and token -1. Used when
    /// maNamespaceDefines is grown with resize().
    NamespaceDefine() : mnToken(-1) {}
};
143 | | |
// Entity binds all information needed for a single file | single call of parseStream
struct Entity : public ParserData
{
    // Amount of work producer sends to consumer in one iteration:
    static const size_t mnEventListSize = 1000;

    // unique for each Entity instance:

    // Number of valid events in mxProducedEvents:
    size_t mnProducedEventsSize;
    // Event list currently being filled; see getEventList().
    std::optional<EventList> mxProducedEvents;
    std::queue<EventList> maPendingEvents;
    // Lists handed back for reuse; see transferUsedEvents().
    std::queue<EventList> maUsedEvents;
    // Held by transferUsedEvents() while it accesses maUsedEvents.
    std::mutex maEventProtector;

    static const size_t mnEventLowWater = 4;
    static const size_t mnEventHighWater = 8;
    osl::Condition maConsumeResume;
    osl::Condition maProduceResume;
    // Event we use to store data if threading is disabled:
    Event maSharedEvent;

    // Per-parse configuration (the copy constructor is deleted, so nothing
    // is copied between Entity instances):

    // Allow to disable threading for small documents:
    bool mbEnableThreads;
    css::xml::sax::InputSource maStructSource;
    // libxml2 parser context; initialized to nullptr by the constructor.
    xmlParserCtxtPtr mpParser;
    ::sax_expatwrap::XMLFile2UTFConverter maConverter;

    // Exceptions cannot be thrown through the C-XmlParser (possible
    // resource leaks), therefore any exception thrown by a UNO callback
    // must be saved somewhere until the C-XmlParser is stopped.
    css::uno::Any maSavedException;
    std::mutex maSavedExceptionMutex;
    // Stores e unless an exception is already pending (first one wins).
    void saveException( const Any & e );
    // Thread-safe check if maSavedException has value
    bool hasException();
    // Always throws a SAXParseException; the error handler is notified first
    // unless mbDuringParse is true while threading is enabled.
    void throwException( const ::rtl::Reference< FastLocatorImpl > &xDocumentLocator,
                         bool mbDuringParse );

    std::stack< NameWithToken, std::vector<NameWithToken> > maNamespaceStack;
    /* Context for main thread consuming events.
     * startElement() stores the data, which characters() and endElement() uses
     */
    std::stack< SaxContext, std::vector<SaxContext> > maContextStack;
    // Determines which elements of maNamespaceDefines are valid in current context
    std::stack< sal_uInt32, std::vector<sal_uInt32> > maNamespaceCount;
    std::vector< NamespaceDefine > maNamespaceDefines;

    explicit Entity( const ParserData& rData );
    Entity( const Entity& rEntity ) = delete;
    Entity& operator=( const Entity& rEntity ) = delete;
    void startElement( Event const *pEvent );
    void characters( const OUString& sChars );
    void endElement();
    void processingInstruction( const OUString& rTarget, const OUString& rData );
    // Takes one list from maUsedEvents (if any) as the new mxProducedEvents.
    void transferUsedEvents();
    // Returns the list being filled, creating or recycling one if needed.
    EventList& getEventList();
    // Returns maSharedEvent when threading is disabled, otherwise the next
    // free slot of the current event list with maType set to aType.
    Event& getEvent( CallbackType aType );
};
205 | | |
// Stuff for custom entity names: one entity name and its replacement text.
// Ordered by `name` through the operator< overloads that follow.
struct ReplacementPair
{
    OUString name;
    OUString replacement;
};
212 | | inline bool operator<(const ReplacementPair& lhs, const ReplacementPair& rhs) |
213 | 0 | { |
214 | 0 | return lhs.name < rhs.name; |
215 | 0 | } |
216 | | inline bool operator<(const ReplacementPair& lhs, const char* rhs) |
217 | 0 | { |
218 | 0 | return lhs.name.compareToAscii(rhs) < 0; |
219 | 0 | } |
220 | | |
221 | | } // namespace |
222 | | |
223 | | namespace sax_fastparser { |
224 | | |
/// Implementation of the fast SAX parser. Holds the cached handler
/// configuration (maData) and a stack of Entity objects, one per
/// parseStream() call.
class FastSaxParserImpl
{
public:
    explicit FastSaxParserImpl();
    ~FastSaxParserImpl();

private:
    std::vector<ReplacementPair> m_Replacements;
    /// Entities released in the destructor via xmlUnlinkNode()/xmlFreeNode().
    std::vector<xmlEntityPtr> m_TemporalEntities;

public:
    // XFastParser
    /// @throws css::xml::sax::SAXException
    /// @throws css::io::IOException
    /// @throws css::uno::RuntimeException
    void parseStream( const css::xml::sax::InputSource& aInputSource );
    /// @throws css::uno::RuntimeException
    void setFastDocumentHandler( const css::uno::Reference< css::xml::sax::XFastDocumentHandler >& Handler );
    /// @throws css::uno::RuntimeException
    void setTokenHandler( const css::uno::Reference< css::xml::sax::XFastTokenHandler >& Handler );
    /// @throws css::lang::IllegalArgumentException
    /// @throws css::uno::RuntimeException
    void registerNamespace( const OUString& NamespaceURL, sal_Int32 NamespaceToken );
    /// @throws css::lang::IllegalArgumentException
    /// @throws css::uno::RuntimeException
    OUString const & getNamespaceURL( std::u16string_view rPrefix );
    /// @throws css::uno::RuntimeException
    void setErrorHandler( const css::uno::Reference< css::xml::sax::XErrorHandler >& Handler );
    /// @throws css::uno::RuntimeException
    void setNamespaceHandler( const css::uno::Reference< css::xml::sax::XFastNamespaceHandler >& Handler);
    // Fake DTD file
    void setCustomEntityNames(
        const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& replacements);

    // called by the extern "C" libxml2 SAX trampolines (call_callback*)
    void callbackStartElement( const xmlChar *localName , const xmlChar* prefix, const xmlChar* URI,
        int numNamespaces, const xmlChar** namespaces, int numAttributes, const xmlChar **attributes );
    void callbackEndElement();
    void callbackCharacters( const xmlChar* s, int nLen );
    void callbackProcessingInstruction( const xmlChar *target, const xmlChar *data );
    xmlEntityPtr callbackGetEntity( const xmlChar *name );

    void pushEntity(const ParserData&, xml::sax::InputSource const&);
    void popEntity();
    /// Returns the cached top of the entity stack. mpTop starts out as
    /// nullptr, so this is only valid while an entity is pushed.
    Entity& getEntity() { return *mpTop; }
    void parse();
    void produce( bool bForceFlush = false );
    /// When true, GetTokenWithPrefix() returns FastToken::DONTKNOW for an
    /// undeclared prefix instead of throwing a SAXException.
    bool m_bIgnoreMissingNSDecl;
    /// When true, parseStream() does not consider the threaded parser.
    bool m_bDisableThreadedParser;

private:
    bool consume(EventList&);
    void deleteUsedEvents();
    void sendPendingCharacters();
    void addUnknownElementWithPrefix(const xmlChar **attributes, int i, rtl::Reference< FastAttributeList > const & xAttributes);

    sal_Int32 GetToken( const xmlChar* pName );
    /// @throws css::xml::sax::SAXException
    sal_Int32 GetTokenWithPrefix( std::string_view sPrefix, const xmlChar* pName );
    /// @throws css::xml::sax::SAXException
    OUString const & GetNamespaceURL( std::string_view rPrefix );
    sal_Int32 GetNamespaceToken( const OUString& rNamespaceURL );
    sal_Int32 GetTokenWithContextNamespace( sal_Int32 nNamespaceToken, const xmlChar* pName );
    void DefineNamespace( const OString& rPrefix, const OUString& namespaceURL );

private:
    std::mutex maMutex; ///< Protecting whole parseStream() execution
    ::rtl::Reference< FastLocatorImpl > mxDocumentLocator;
    NamespaceMap maNamespaceMap;

    ParserData maData;      /// Cached parser configuration for next call of parseStream().

    Entity *mpTop;          /// std::stack::top() is amazingly slow => cache this.
    std::stack< Entity > maEntities; /// Entity stack for each call of parseStream().
    std::vector<char> pendingCharacters; /// Data from characters() callback that needs to be sent.
};
301 | | |
302 | | } // namespace sax_fastparser |
303 | | |
304 | | namespace { |
305 | | |
306 | | class ParserThread: public salhelper::Thread |
307 | | { |
308 | | FastSaxParserImpl *mpParser; |
309 | | public: |
310 | 0 | explicit ParserThread(FastSaxParserImpl *pParser): Thread("Parser"), mpParser(pParser) {} |
311 | | private: |
312 | | virtual void execute() override |
313 | 0 | { |
314 | 0 | try |
315 | 0 | { |
316 | 0 | mpParser->parse(); |
317 | 0 | } |
318 | 0 | catch (...) |
319 | 0 | { |
320 | 0 | Entity &rEntity = mpParser->getEntity(); |
321 | 0 | rEntity.getEvent( CallbackType::EXCEPTION ); |
322 | 0 | mpParser->produce( true ); |
323 | 0 | } |
324 | 0 | } |
325 | | }; |
326 | | |
327 | | extern "C" { |
328 | | |
329 | | static void call_callbackStartElement(void *userData, const xmlChar *localName , const xmlChar* prefix, const xmlChar* URI, |
330 | | int numNamespaces, const xmlChar** namespaces, int numAttributes, int /*defaultedAttributes*/, const xmlChar **attributes) |
331 | 37.5M | { |
332 | 37.5M | FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData ); |
333 | 37.5M | pFastParser->callbackStartElement( localName, prefix, URI, numNamespaces, namespaces, numAttributes, attributes ); |
334 | 37.5M | } |
335 | | |
336 | | static void call_callbackEndElement(void *userData, const xmlChar* /*localName*/, const xmlChar* /*prefix*/, const xmlChar* /*URI*/) |
337 | 17.4M | { |
338 | 17.4M | FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData ); |
339 | 17.4M | pFastParser->callbackEndElement(); |
340 | 17.4M | } |
341 | | |
342 | | static void call_callbackCharacters( void *userData , const xmlChar *s , int nLen ) |
343 | 14.4M | { |
344 | 14.4M | FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData ); |
345 | 14.4M | pFastParser->callbackCharacters( s, nLen ); |
346 | 14.4M | } |
347 | | |
348 | | static void call_callbackProcessingInstruction( void *userData, const xmlChar *target, const xmlChar *data ) |
349 | 71.2k | { |
350 | 71.2k | FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData ); |
351 | 71.2k | pFastParser->callbackProcessingInstruction( target, data ); |
352 | 71.2k | } |
353 | | |
354 | | static xmlEntityPtr call_callbackGetEntity( void *userData, const xmlChar *name) |
355 | 70.1k | { |
356 | 70.1k | FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData ); |
357 | 70.1k | return pFastParser->callbackGetEntity( name ); |
358 | 70.1k | } |
359 | | |
360 | | } |
361 | | |
362 | | class FastLocatorImpl : public WeakImplHelper< XLocator > |
363 | | { |
364 | | public: |
365 | 372k | explicit FastLocatorImpl(FastSaxParserImpl *p) : mpParser(p) {} |
366 | | |
367 | 372k | void dispose() { mpParser = nullptr; } |
368 | | /// @throws RuntimeException |
369 | 1.41M | void checkDispose() const { if( !mpParser ) throw DisposedException(); } |
370 | | |
371 | | //XLocator |
372 | | virtual sal_Int32 SAL_CALL getColumnNumber() override; |
373 | | virtual sal_Int32 SAL_CALL getLineNumber() override; |
374 | | virtual OUString SAL_CALL getPublicId() override; |
375 | | virtual OUString SAL_CALL getSystemId() override; |
376 | | |
377 | | private: |
378 | | FastSaxParserImpl *mpParser; |
379 | | }; |
380 | | |
381 | | sal_Int32 SAL_CALL FastLocatorImpl::getColumnNumber() |
382 | 298k | { |
383 | 298k | checkDispose(); |
384 | 298k | return xmlSAX2GetColumnNumber( mpParser->getEntity().mpParser ); |
385 | 298k | } |
386 | | |
387 | | sal_Int32 SAL_CALL FastLocatorImpl::getLineNumber() |
388 | 407k | { |
389 | 407k | checkDispose(); |
390 | 407k | return xmlSAX2GetLineNumber( mpParser->getEntity().mpParser ); |
391 | 407k | } |
392 | | |
393 | | OUString SAL_CALL FastLocatorImpl::getPublicId() |
394 | 298k | { |
395 | 298k | checkDispose(); |
396 | 298k | return mpParser->getEntity().maStructSource.sPublicId; |
397 | 298k | } |
398 | | |
399 | | OUString SAL_CALL FastLocatorImpl::getSystemId() |
400 | 407k | { |
401 | 407k | checkDispose(); |
402 | 407k | return mpParser->getEntity().maStructSource.sSystemId; |
403 | 407k | } |
404 | | |
405 | | ParserData::ParserData() |
406 | 372k | {} |
407 | | |
408 | | Entity::Entity(const ParserData& rData) |
409 | 250k | : ParserData(rData) |
410 | 250k | , mnProducedEventsSize(0) |
411 | 250k | , mbEnableThreads(false) |
412 | 250k | , mpParser(nullptr) |
413 | 250k | { |
414 | 250k | } |
415 | | |
416 | | void Entity::startElement( Event const *pEvent ) |
417 | 37.5M | { |
418 | 37.5M | const sal_Int32& nElementToken = pEvent->mnElementToken; |
419 | 37.5M | const OUString& aNamespace = pEvent->msNamespace; |
420 | 37.5M | const OUString& aElementName = pEvent->msElementName; |
421 | | |
422 | | // Use un-wrapped pointers to avoid significant acquire/release overhead |
423 | 37.5M | XFastContextHandler *pParentContext = nullptr; |
424 | 37.5M | if( !maContextStack.empty() ) |
425 | 37.3M | { |
426 | 37.3M | pParentContext = maContextStack.top().mxContext.get(); |
427 | 37.3M | if( !pParentContext ) |
428 | 151k | { |
429 | 151k | maContextStack.push( SaxContext(nElementToken, aNamespace, aElementName) ); |
430 | 151k | return; |
431 | 151k | } |
432 | 37.3M | } |
433 | | |
434 | 37.4M | maContextStack.push( SaxContext( nElementToken, aNamespace, aElementName ) ); |
435 | | |
436 | 37.4M | try |
437 | 37.4M | { |
438 | 37.4M | const Reference< XFastAttributeList > xAttr( pEvent->mxAttributes ); |
439 | 37.4M | Reference< XFastContextHandler > xContext; |
440 | | |
441 | 37.4M | if ( mxNamespaceHandler.is() ) |
442 | 25.3M | { |
443 | 25.3M | const Sequence< xml::Attribute > NSDeclAttribs = pEvent->mxDeclAttributes->getUnknownAttributes(); |
444 | 25.3M | for (const auto& rNSDeclAttrib : NSDeclAttribs) |
445 | 1.32M | { |
446 | 1.32M | mxNamespaceHandler->registerNamespace( rNSDeclAttrib.Name, rNSDeclAttrib.Value ); |
447 | 1.32M | } |
448 | 25.3M | } |
449 | | |
450 | 37.4M | if( nElementToken == FastToken::DONTKNOW ) |
451 | 19.3M | { |
452 | 19.3M | if( pParentContext ) |
453 | 19.3M | xContext = pParentContext->createUnknownChildContext( aNamespace, aElementName, xAttr ); |
454 | 44.5k | else if( mxDocumentHandler.is() ) |
455 | 44.5k | xContext = mxDocumentHandler->createUnknownChildContext( aNamespace, aElementName, xAttr ); |
456 | | |
457 | 19.3M | if( xContext.is() ) |
458 | 19.3M | { |
459 | 19.3M | xContext->startUnknownElement( aNamespace, aElementName, xAttr ); |
460 | 19.3M | } |
461 | 19.3M | } |
462 | 18.0M | else |
463 | 18.0M | { |
464 | 18.0M | if( pParentContext ) |
465 | 17.8M | xContext = pParentContext->createFastChildContext( nElementToken, xAttr ); |
466 | 182k | else if( mxDocumentHandler.is() ) |
467 | 182k | xContext = mxDocumentHandler->createFastChildContext( nElementToken, xAttr ); |
468 | | |
469 | 18.0M | if( xContext.is() ) |
470 | 16.6M | xContext->startFastElement( nElementToken, xAttr ); |
471 | 18.0M | } |
472 | | // swap the reference we own in to avoid referencing thrash. |
473 | 37.4M | maContextStack.top().mxContext = std::move( xContext ); |
474 | 37.4M | } |
475 | 37.4M | catch (...) |
476 | 37.4M | { |
477 | 7.81k | saveException( ::cppu::getCaughtException() ); |
478 | 7.81k | } |
479 | 37.4M | } |
480 | | |
481 | | void Entity::characters( const OUString& sChars ) |
482 | 12.7M | { |
483 | 12.7M | if (maContextStack.empty()) |
484 | 57 | { |
485 | | // Malformed XML stream !? |
486 | 57 | return; |
487 | 57 | } |
488 | | |
489 | 12.7M | XFastContextHandler * pContext( maContextStack.top().mxContext.get() ); |
490 | 12.7M | if( pContext ) try |
491 | 12.7M | { |
492 | 12.7M | pContext->characters( sChars ); |
493 | 12.7M | } |
494 | 12.7M | catch (...) |
495 | 12.7M | { |
496 | 0 | saveException( ::cppu::getCaughtException() ); |
497 | 0 | } |
498 | 12.7M | } |
499 | | |
500 | | void Entity::endElement() |
501 | 17.4M | { |
502 | 17.4M | if (maContextStack.empty()) |
503 | 597 | { |
504 | | // Malformed XML stream !? |
505 | 597 | return; |
506 | 597 | } |
507 | | |
508 | 17.4M | const SaxContext& aContext = maContextStack.top(); |
509 | 17.4M | XFastContextHandler* pContext( aContext.mxContext.get() ); |
510 | 17.4M | if( pContext ) |
511 | 15.9M | try |
512 | 15.9M | { |
513 | 15.9M | sal_Int32 nElementToken = aContext.mnElementToken; |
514 | 15.9M | if( nElementToken != FastToken::DONTKNOW ) |
515 | 14.8M | pContext->endFastElement( nElementToken ); |
516 | 1.15M | else |
517 | 1.15M | pContext->endUnknownElement( *aContext.moNamespace, *aContext.moElementName ); |
518 | 15.9M | } |
519 | 15.9M | catch (...) |
520 | 15.9M | { |
521 | 64.3k | saveException( ::cppu::getCaughtException() ); |
522 | 64.3k | } |
523 | 17.4M | maContextStack.pop(); |
524 | 17.4M | } |
525 | | |
526 | | void Entity::processingInstruction( const OUString& rTarget, const OUString& rData ) |
527 | 71.2k | { |
528 | 71.2k | if( mxDocumentHandler.is() ) try |
529 | 71.2k | { |
530 | 71.2k | mxDocumentHandler->processingInstruction( rTarget, rData ); |
531 | 71.2k | } |
532 | 71.2k | catch (...) |
533 | 71.2k | { |
534 | 0 | saveException( ::cppu::getCaughtException() ); |
535 | 0 | } |
536 | 71.2k | } |
537 | | |
538 | | void Entity::transferUsedEvents() |
539 | 0 | { |
540 | 0 | std::unique_lock aGuard(maEventProtector); |
541 | 0 | if (!maUsedEvents.empty()) |
542 | 0 | { |
543 | 0 | mxProducedEvents = std::move(maUsedEvents.front()); |
544 | 0 | maUsedEvents.pop(); |
545 | 0 | aGuard.unlock(); // unlock |
546 | 0 | mnProducedEventsSize = 0; |
547 | 0 | } |
548 | 0 | } |
549 | | |
550 | | EventList& Entity::getEventList() |
551 | 0 | { |
552 | 0 | if (!mxProducedEvents) |
553 | 0 | { |
554 | 0 | transferUsedEvents(); |
555 | 0 | if (!mxProducedEvents) |
556 | 0 | { |
557 | 0 | mxProducedEvents.emplace(); |
558 | 0 | mxProducedEvents->maEvents.resize(mnEventListSize); |
559 | 0 | mxProducedEvents->mbIsAttributesEmpty = false; |
560 | 0 | mnProducedEventsSize = 0; |
561 | 0 | } |
562 | 0 | } |
563 | 0 | return *mxProducedEvents; |
564 | 0 | } |
565 | | |
566 | | Event& Entity::getEvent( CallbackType aType ) |
567 | 55.2M | { |
568 | 55.2M | if (!mbEnableThreads) |
569 | 55.2M | return maSharedEvent; |
570 | | |
571 | 19 | EventList& rEventList = getEventList(); |
572 | 19 | if (mnProducedEventsSize == rEventList.maEvents.size()) |
573 | 0 | { |
574 | 0 | SAL_WARN_IF(!maSavedException.hasValue(), "sax", |
575 | 0 | "Event vector should only exceed " << mnEventListSize << |
576 | 0 | " temporarily while an exception is pending"); |
577 | 0 | rEventList.maEvents.resize(mnProducedEventsSize + 1); |
578 | 0 | } |
579 | 19 | Event& rEvent = rEventList.maEvents[mnProducedEventsSize++]; |
580 | 19 | rEvent.maType = aType; |
581 | 19 | return rEvent; |
582 | 19 | } |
583 | | |
584 | | OUString lclGetErrorMessage( xmlParserCtxtPtr ctxt, std::u16string_view sSystemId, sal_Int32 nLine ) |
585 | 108k | { |
586 | 108k | const char* pMessage; |
587 | 108k | const xmlError* error = xmlCtxtGetLastError( ctxt ); |
588 | 108k | if( error && error->message ) |
589 | 107k | pMessage = error->message; |
590 | 1.40k | else |
591 | 1.40k | pMessage = "unknown error"; |
592 | 108k | return OUString::Concat("[") + sSystemId + " line " + OUString::number(nLine) + "]: " + |
593 | 108k | OUString(pMessage, strlen(pMessage), RTL_TEXTENCODING_ASCII_US); |
594 | 108k | } |
595 | | |
596 | | // throw an exception, but avoid callback if |
597 | | // during a threaded produce |
598 | | void Entity::throwException( const ::rtl::Reference< FastLocatorImpl > &xDocumentLocator, |
599 | | bool mbDuringParse ) |
600 | 108k | { |
601 | | // Error during parsing ! |
602 | 108k | Any savedException; |
603 | 108k | { |
604 | 108k | std::scoped_lock g(maSavedExceptionMutex); |
605 | 108k | if (maSavedException.hasValue()) |
606 | 5.20k | { |
607 | 5.20k | savedException.setValue(&maSavedException, cppu::UnoType<decltype(maSavedException)>::get()); |
608 | 5.20k | } |
609 | 108k | } |
610 | 108k | SAXParseException aExcept( |
611 | 108k | lclGetErrorMessage( mpParser, |
612 | 108k | xDocumentLocator->getSystemId(), |
613 | 108k | xDocumentLocator->getLineNumber() ), |
614 | 108k | Reference< XInterface >(), |
615 | 108k | savedException, |
616 | 108k | xDocumentLocator->getPublicId(), |
617 | 108k | xDocumentLocator->getSystemId(), |
618 | 108k | xDocumentLocator->getLineNumber(), |
619 | 108k | xDocumentLocator->getColumnNumber() |
620 | 108k | ); |
621 | | |
622 | | // error handler is set, it may throw the exception |
623 | 108k | if( !mbDuringParse || !mbEnableThreads ) |
624 | 108k | { |
625 | 108k | if (mxErrorHandler.is() ) |
626 | 0 | mxErrorHandler->fatalError( Any( aExcept ) ); |
627 | 108k | } |
628 | | |
629 | | // error handler has not thrown, but parsing must stop => throw ourselves |
630 | 108k | throw aExcept; |
631 | 108k | } |
632 | | |
633 | | // In the single threaded case we emit events via our C |
634 | | // callbacks, so any exception caught must be queued up until |
635 | | // we can safely re-throw it from our C++ parent of parse() |
636 | | |
637 | | // If multi-threaded, we need to push an EXCEPTION event, at |
638 | | // which point we transfer ownership of maSavedException to |
639 | | // the consuming thread. |
640 | | void Entity::saveException( const Any & e ) |
641 | 73.1k | { |
642 | | // fdo#81214 - allow the parser to run on after an exception, |
643 | | // unexpectedly some 'startElements' produce a UNO_QUERY_THROW |
644 | | // for XComponent; and yet expect to continue parsing. |
645 | 73.1k | SAL_WARN("sax", "Unexpected exception from XML parser " << exceptionToString(e)); |
646 | 73.1k | std::scoped_lock g(maSavedExceptionMutex); |
647 | 73.1k | if (maSavedException.hasValue()) |
648 | 67.9k | { |
649 | 67.9k | SAL_INFO("sax.fastparser", "discarding exception, already have one"); |
650 | 67.9k | } |
651 | 5.20k | else |
652 | 5.20k | { |
653 | 5.20k | maSavedException = e; |
654 | 5.20k | } |
655 | 73.1k | } |
656 | | |
657 | | bool Entity::hasException() |
658 | 408k | { |
659 | 408k | std::scoped_lock g(maSavedExceptionMutex); |
660 | 408k | return maSavedException.hasValue(); |
661 | 408k | } |
662 | | |
663 | | } // namespace |
664 | | |
665 | | namespace sax_fastparser { |
666 | | |
667 | | FastSaxParserImpl::FastSaxParserImpl() : |
668 | 372k | m_bIgnoreMissingNSDecl(false), |
669 | 372k | m_bDisableThreadedParser(false), |
670 | 372k | mpTop(nullptr) |
671 | 372k | { |
672 | 372k | mxDocumentLocator.set( new FastLocatorImpl( this ) ); |
673 | 372k | } |
674 | | |
675 | | FastSaxParserImpl::~FastSaxParserImpl() |
676 | 372k | { |
677 | 372k | if( mxDocumentLocator.is() ) |
678 | 372k | mxDocumentLocator->dispose(); |
679 | 372k | for (auto& entity : m_TemporalEntities) |
680 | 0 | { |
681 | 0 | if (!entity) |
682 | 0 | continue; |
683 | 0 | xmlNodePtr pPtr = reinterpret_cast<xmlNodePtr>(entity); |
684 | 0 | xmlUnlinkNode(pPtr); |
685 | 0 | xmlFreeNode(pPtr); |
686 | 0 | } |
687 | 372k | } |
688 | | |
689 | | void FastSaxParserImpl::DefineNamespace( const OString& rPrefix, const OUString& namespaceURL ) |
690 | 1.59M | { |
691 | 1.59M | Entity& rEntity = getEntity(); |
692 | 1.59M | assert(!rEntity.maNamespaceCount.empty()); // need a context! |
693 | | |
694 | 1.59M | sal_uInt32 nOffset = rEntity.maNamespaceCount.top()++; |
695 | 1.59M | if( rEntity.maNamespaceDefines.size() <= nOffset ) |
696 | 226k | rEntity.maNamespaceDefines.resize( rEntity.maNamespaceDefines.size() + 64 ); |
697 | | |
698 | 1.59M | rEntity.maNamespaceDefines[nOffset] = NamespaceDefine( rPrefix, GetNamespaceToken( namespaceURL ), namespaceURL ); |
699 | 1.59M | } |
700 | | |
701 | | sal_Int32 FastSaxParserImpl::GetToken(const xmlChar* pName) |
702 | 57.0M | { |
703 | 57.0M | return FastTokenHandlerBase::getTokenFromChars( getEntity(). mxTokenHandler.get(), |
704 | 57.0M | XML_CAST( pName ) ); // uses utf-8 |
705 | 57.0M | } |
706 | | |
707 | | sal_Int32 FastSaxParserImpl::GetTokenWithPrefix( std::string_view sPrefix, const xmlChar* pName ) |
708 | 21.1M | { |
709 | 21.1M | Entity& rEntity = getEntity(); |
710 | 21.1M | if (rEntity.maNamespaceCount.empty()) |
711 | 0 | return FastToken::DONTKNOW; |
712 | | |
713 | 21.1M | sal_uInt32 nNamespace = rEntity.maNamespaceCount.top(); |
714 | 193M | while( nNamespace-- ) |
715 | 192M | { |
716 | 192M | const auto & rNamespaceDefine = rEntity.maNamespaceDefines[nNamespace]; |
717 | 192M | if( rNamespaceDefine.maPrefix == sPrefix ) |
718 | 19.8M | return GetTokenWithContextNamespace(rNamespaceDefine.mnToken, pName); |
719 | 192M | } |
720 | | |
721 | 1.32M | if (!m_bIgnoreMissingNSDecl) |
722 | 964 | throw SAXException("No namespace defined for " + OStringToOUString(sPrefix, |
723 | 964 | RTL_TEXTENCODING_UTF8), {}, {}); |
724 | | |
725 | 1.32M | return FastToken::DONTKNOW; |
726 | 1.32M | } |
727 | | |
728 | | sal_Int32 FastSaxParserImpl::GetNamespaceToken( const OUString& rNamespaceURL ) |
729 | 25.0M | { |
730 | 25.0M | NamespaceMap::iterator aIter( maNamespaceMap.find( rNamespaceURL ) ); |
731 | 25.0M | if( aIter != maNamespaceMap.end() ) |
732 | 1.18M | return (*aIter).second; |
733 | 23.8M | else |
734 | 23.8M | return FastToken::DONTKNOW; |
735 | 25.0M | } |
736 | | |
737 | | OUString const & FastSaxParserImpl::GetNamespaceURL( std::string_view rPrefix ) |
738 | 0 | { |
739 | 0 | Entity& rEntity = getEntity(); |
740 | 0 | if( !rEntity.maNamespaceCount.empty() ) |
741 | 0 | { |
742 | 0 | sal_uInt32 nNamespace = rEntity.maNamespaceCount.top(); |
743 | 0 | while( nNamespace-- ) |
744 | 0 | if( rEntity.maNamespaceDefines[nNamespace].maPrefix == rPrefix ) |
745 | 0 | return rEntity.maNamespaceDefines[nNamespace].maNamespaceURL; |
746 | 0 | } |
747 | | |
748 | 0 | throw SAXException("No namespace defined for " + OUString::fromUtf8(rPrefix), |
749 | 0 | Reference< XInterface >(), Any()); |
750 | 0 | } |
751 | | |
752 | | sal_Int32 FastSaxParserImpl::GetTokenWithContextNamespace( sal_Int32 nNamespaceToken, const xmlChar* pName ) |
753 | 25.6M | { |
754 | 25.6M | if( nNamespaceToken != FastToken::DONTKNOW ) |
755 | 24.7M | { |
756 | 24.7M | sal_Int32 nNameToken = GetToken( pName ); |
757 | 24.7M | if( nNameToken != FastToken::DONTKNOW ) |
758 | 23.3M | return nNamespaceToken | nNameToken; |
759 | 24.7M | } |
760 | | |
761 | 2.34M | return FastToken::DONTKNOW; |
762 | 25.6M | } |
763 | | |
764 | | namespace |
765 | | { |
766 | | class ParserCleanup |
767 | | { |
768 | | private: |
769 | | FastSaxParserImpl& m_rParser; |
770 | | Entity& m_rEntity; |
771 | | rtl::Reference<ParserThread> m_xParser; |
772 | | public: |
773 | | ParserCleanup(FastSaxParserImpl& rParser, Entity& rEntity) |
774 | 250k | : m_rParser(rParser) |
775 | 250k | , m_rEntity(rEntity) |
776 | 250k | { |
777 | 250k | } |
778 | | ~ParserCleanup() |
779 | 250k | { |
780 | 250k | if (m_rEntity.mpParser) |
781 | 244k | { |
782 | 244k | if (m_rEntity.mpParser->myDoc) |
783 | 1.91k | xmlFreeDoc(m_rEntity.mpParser->myDoc); |
784 | 244k | xmlFreeParserCtxt(m_rEntity.mpParser); |
785 | 244k | } |
786 | 250k | joinThread(); |
787 | 250k | m_rParser.popEntity(); |
788 | 250k | } |
789 | | void setThread(const rtl::Reference<ParserThread> &xParser) |
790 | 0 | { |
791 | 0 | m_xParser = xParser; |
792 | 0 | } |
793 | | void joinThread() |
794 | 250k | { |
795 | 250k | if (m_xParser.is()) |
796 | 0 | { |
797 | 0 | rtl::Reference<ParserThread> xToJoin = m_xParser; |
798 | 0 | m_xParser.clear(); |
799 | 0 | xToJoin->join(); |
800 | 0 | } |
801 | 250k | } |
802 | | }; |
803 | | } |
/***************
*
* parseStream does Parser-startup initializations. The FastSaxParser::parse() method does
* the file-specific initialization work. (During a parser run, external files may be opened)
*
* The input is either parsed synchronously via parse(), or - when threading is
* enabled for this entity - by a ParserThread while this thread consumes the
* queued event lists.
*
* Throws (via pushEntity) if rStructSource has no input stream; errors saved
* during parsing are re-raised through Entity::throwException.
*
****************/
void FastSaxParserImpl::parseStream(const InputSource& rStructSource)
{
    xmlInitParser();

    // Only one text at one time
    std::unique_lock guard( maMutex );

    pushEntity(maData, rStructSource);
    Entity& rEntity = getEntity();
    // frees the libxml2 context, joins the thread and pops the entity on every exit path
    ParserCleanup aEnsureFree(*this, rEntity);

    // start the document
    if( rEntity.mxDocumentHandler.is() )
    {
        rEntity.mxDocumentHandler->setDocumentLocator( mxDocumentLocator );
        rEntity.mxDocumentHandler->startDocument();
    }

#ifdef EMSCRIPTEN
    rEntity.mbEnableThreads = false;
#else
    // threading can be switched off via environment variable or via the
    // "DisableThreadedParser" initialization argument
    if (!getenv("SAX_DISABLE_THREADS") && !m_bDisableThreadedParser)
    {
        Reference<css::io::XSeekable> xSeekable(rEntity.maStructSource.aInputStream, UNO_QUERY);
        // available() is not __really__ relevant here, but leave it in as a heuristic for non-seekable streams
        rEntity.mbEnableThreads = (xSeekable.is() && xSeekable->getLength() > 10000)
            || (rEntity.maStructSource.aInputStream->available() > 10000);
    }
#endif

    if (rEntity.mbEnableThreads)
    {
        rtl::Reference<ParserThread> xParser = new ParserThread(this);
        xParser->launch();
        aEnsureFree.setThread(xParser);
        bool done = false;
        do {
            // sleep until the producer signals that event lists are available
            rEntity.maConsumeResume.wait();
            rEntity.maConsumeResume.reset();

            std::unique_lock aGuard(rEntity.maEventProtector);
            while (!rEntity.maPendingEvents.empty())
            {
                if (rEntity.maPendingEvents.size() <= Entity::mnEventLowWater)
                    rEntity.maProduceResume.set(); // start producer again

                EventList aEventList = std::move(rEntity.maPendingEvents.front());
                rEntity.maPendingEvents.pop();
                aGuard.unlock(); // unlock

                // handlers are called without holding the queue lock;
                // consume() returns false once the DONE event was seen
                if (!consume(aEventList))
                    done = true;

                aGuard.lock(); // lock

                // when the queue is short, use the time to clear the
                // attribute lists of the consumed events (outside the lock)
                // and mark the list so the producer can skip clearing them
                if ( rEntity.maPendingEvents.size() <= Entity::mnEventLowWater )
                {
                    aGuard.unlock();
                    for (auto& rEvent : aEventList.maEvents)
                    {
                        if (rEvent.mxAttributes.is())
                        {
                            rEvent.mxAttributes->clear();
                            if( rEntity.mxNamespaceHandler.is() )
                                rEvent.mxDeclAttributes->clear();
                        }
                        aEventList.mbIsAttributesEmpty = true;
                    }
                    aGuard.lock();
                }

                // hand the list over to the used-events queue
                rEntity.maUsedEvents.push(std::move(aEventList));
            }
        } while (!done);
        aEnsureFree.joinThread();
        deleteUsedEvents();

        // callbacks used inside XML_Parse may have caught an exception. No need
        // to lock maSavedExceptionMutex here because parser thread is joined.
        // coverity[missing_lock : SUPPRESS] 2024.6.1
        if( rEntity.maSavedException.hasValue() )
            rEntity.throwException( mxDocumentLocator, true );
    }
    else
    {
        parse();
    }

    // finish document
    if( rEntity.mxDocumentHandler.is() )
    {
        rEntity.mxDocumentHandler->endDocument();
    }
}
904 | | |
/// Store the document handler in the parser data that pushEntity() copies into each new entity.
void FastSaxParserImpl::setFastDocumentHandler( const Reference< XFastDocumentHandler >& Handler )
{
    maData.mxDocumentHandler = Handler;
}
909 | | |
910 | | void FastSaxParserImpl::setTokenHandler( const Reference< XFastTokenHandler >& xHandler ) |
911 | 372k | { |
912 | 372k | assert( dynamic_cast< FastTokenHandlerBase *>( xHandler.get() ) && "we expect this handler to be a subclass of FastTokenHandlerBase" ); |
913 | 372k | maData.mxTokenHandler = dynamic_cast< FastTokenHandlerBase *>( xHandler.get() ); |
914 | 372k | } |
915 | | |
916 | | void FastSaxParserImpl::registerNamespace( const OUString& NamespaceURL, sal_Int32 NamespaceToken ) |
917 | 22.0M | { |
918 | 22.0M | if( NamespaceToken < FastToken::NAMESPACE ) |
919 | 0 | throw IllegalArgumentException("Invalid namespace token " + OUString::number(NamespaceToken), css::uno::Reference<css::uno::XInterface >(), 0); |
920 | | |
921 | 22.0M | if( GetNamespaceToken( NamespaceURL ) == FastToken::DONTKNOW ) |
922 | 22.0M | { |
923 | 22.0M | maNamespaceMap[ NamespaceURL ] = NamespaceToken; |
924 | 22.0M | return; |
925 | 22.0M | } |
926 | 0 | throw IllegalArgumentException("namespace URL is already registered: " + NamespaceURL, css::uno::Reference<css::uno::XInterface >(), 0); |
927 | 22.0M | } |
928 | | |
929 | | OUString const & FastSaxParserImpl::getNamespaceURL( std::u16string_view rPrefix ) |
930 | 0 | { |
931 | 0 | try |
932 | 0 | { |
933 | 0 | return GetNamespaceURL( OUStringToOString( rPrefix, RTL_TEXTENCODING_UTF8 ) ); |
934 | 0 | } |
935 | 0 | catch (const Exception&) |
936 | 0 | { |
937 | 0 | } |
938 | 0 | throw IllegalArgumentException(); |
939 | 0 | } |
940 | | |
/// Store the error handler in the parser data that pushEntity() copies into each new entity.
void FastSaxParserImpl::setErrorHandler(const Reference< XErrorHandler > & Handler)
{
    maData.mxErrorHandler = Handler;
}
945 | | |
/// Store the namespace handler; when set, namespace declarations are collected per start element.
void FastSaxParserImpl::setNamespaceHandler( const Reference< XFastNamespaceHandler >& Handler )
{
    maData.mxNamespaceHandler = Handler;
}
950 | | |
951 | | void FastSaxParserImpl::setCustomEntityNames( |
952 | | const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& replacements) |
953 | 0 | { |
954 | 0 | m_Replacements.resize(replacements.size()); |
955 | 0 | for (size_t i = 0; i < replacements.size(); ++i) |
956 | 0 | { |
957 | 0 | m_Replacements[i].name = replacements[i].First; |
958 | 0 | m_Replacements[i].replacement = replacements[i].Second; |
959 | 0 | } |
960 | 0 | if (m_Replacements.size() > 1) |
961 | 0 | std::sort(m_Replacements.begin(), m_Replacements.end()); |
962 | 0 | } |
963 | | |
/** Drain the current entity's queue of used event lists.

    Each list is moved out of the queue while the event lock is held, but is
    destroyed only after the lock has been released.
*/
void FastSaxParserImpl::deleteUsedEvents()
{
    Entity& rEntity = getEntity();
    std::unique_lock aGuard(rEntity.maEventProtector);

    while (!rEntity.maUsedEvents.empty())
    {
        { // the block makes sure that aEventList is destructed outside the lock
            EventList aEventList = std::move(rEntity.maUsedEvents.front());
            rEntity.maUsedEvents.pop();

            aGuard.unlock(); // unlock
        }

        // re-acquire before the next emptiness check of the shared queue
        aGuard.lock(); // lock
    }
}
981 | | |
/** Hand the currently filled event list over to the consumer.

    Does nothing unless bForceFlush is set or the number of produced events
    has reached Entity::mnEventListSize. Blocks while the pending queue holds
    Entity::mnEventHighWater or more lists.

    @param bForceFlush  push the current list even if it is not full yet
*/
void FastSaxParserImpl::produce( bool bForceFlush )
{
    Entity& rEntity = getEntity();
    if (!(bForceFlush ||
          rEntity.mnProducedEventsSize >= Entity::mnEventListSize))
        return;

    std::unique_lock aGuard(rEntity.maEventProtector);

    while (rEntity.maPendingEvents.size() >= Entity::mnEventHighWater)
    { // pause parsing for a bit
        aGuard.unlock(); // unlock
        // wait (without the lock) until the consumer signals that it caught up
        rEntity.maProduceResume.wait();
        rEntity.maProduceResume.reset();
        aGuard.lock(); // lock
    }

    // NOTE(review): dereferences mxProducedEvents unconditionally - presumably
    // callers always create an event via getEvent() first; confirm
    rEntity.maPendingEvents.push(std::move(*rEntity.mxProducedEvents));

    aGuard.unlock(); // unlock

    // drop the moved-from list
    rEntity.mxProducedEvents.reset();
    assert(!rEntity.mxProducedEvents);

    // wake the consumer
    rEntity.maConsumeResume.set();
}
1008 | | |
1009 | | bool FastSaxParserImpl::consume(EventList& rEventList) |
1010 | 0 | { |
1011 | 0 | Entity& rEntity = getEntity(); |
1012 | 0 | rEventList.mbIsAttributesEmpty = false; |
1013 | 0 | for (auto& rEvent : rEventList.maEvents) |
1014 | 0 | { |
1015 | 0 | switch (rEvent.maType) |
1016 | 0 | { |
1017 | 0 | case CallbackType::START_ELEMENT: |
1018 | 0 | rEntity.startElement( &rEvent ); |
1019 | 0 | break; |
1020 | 0 | case CallbackType::END_ELEMENT: |
1021 | 0 | rEntity.endElement(); |
1022 | 0 | break; |
1023 | 0 | case CallbackType::CHARACTERS: |
1024 | 0 | rEntity.characters( rEvent.msChars ); |
1025 | 0 | break; |
1026 | 0 | case CallbackType::PROCESSING_INSTRUCTION: |
1027 | 0 | rEntity.processingInstruction( |
1028 | 0 | rEvent.msNamespace, rEvent.msElementName ); // ( target, data ) |
1029 | 0 | break; |
1030 | 0 | case CallbackType::DONE: |
1031 | 0 | return false; |
1032 | 0 | case CallbackType::EXCEPTION: |
1033 | 0 | rEntity.throwException( mxDocumentLocator, false ); |
1034 | 0 | [[fallthrough]]; // avoid unreachable code warning with some compilers |
1035 | 0 | default: |
1036 | 0 | assert(false); |
1037 | 0 | return false; |
1038 | 0 | } |
1039 | 0 | } |
1040 | 0 | return true; |
1041 | 0 | } |
1042 | | |
1043 | | void FastSaxParserImpl::pushEntity(const ParserData& rEntityData, |
1044 | | xml::sax::InputSource const& rSource) |
1045 | 274k | { |
1046 | 274k | if (!rSource.aInputStream.is()) |
1047 | 24.3k | throw SAXException(u"No input source"_ustr, Reference<XInterface>(), Any()); |
1048 | | |
1049 | 250k | maEntities.emplace(rEntityData); |
1050 | 250k | mpTop = &maEntities.top(); |
1051 | | |
1052 | 250k | mpTop->maStructSource = rSource; |
1053 | | |
1054 | 250k | mpTop->maConverter.setInputStream(mpTop->maStructSource.aInputStream); |
1055 | 250k | if (!mpTop->maStructSource.sEncoding.isEmpty()) |
1056 | 0 | { |
1057 | 0 | mpTop->maConverter.setEncoding(OUStringToOString(mpTop->maStructSource.sEncoding, RTL_TEXTENCODING_ASCII_US)); |
1058 | 0 | } |
1059 | 250k | } |
1060 | | |
1061 | | void FastSaxParserImpl::popEntity() |
1062 | 250k | { |
1063 | 250k | maEntities.pop(); |
1064 | 250k | mpTop = !maEntities.empty() ? &maEntities.top() : nullptr; |
1065 | 250k | } |
1066 | | |
1067 | | // starts parsing with actual parser ! |
1068 | | void FastSaxParserImpl::parse() |
1069 | 250k | { |
1070 | 250k | const int BUFFER_SIZE = 16 * 1024; |
1071 | 250k | Sequence< sal_Int8 > seqOut( BUFFER_SIZE ); |
1072 | | |
1073 | 250k | Entity& rEntity = getEntity(); |
1074 | | |
1075 | | // set all necessary C-Callbacks |
1076 | 250k | static xmlSAXHandler callbacks; |
1077 | 250k | callbacks.startElementNs = call_callbackStartElement; |
1078 | 250k | callbacks.endElementNs = call_callbackEndElement; |
1079 | 250k | callbacks.characters = call_callbackCharacters; |
1080 | 250k | callbacks.processingInstruction = call_callbackProcessingInstruction; |
1081 | 250k | callbacks.getEntity = call_callbackGetEntity; |
1082 | 250k | callbacks.initialized = XML_SAX2_MAGIC; |
1083 | 250k | int nRead = 0; |
1084 | 250k | do |
1085 | 521k | { |
1086 | 521k | nRead = rEntity.maConverter.readAndConvert( seqOut, BUFFER_SIZE ); |
1087 | 521k | if( nRead <= 0 ) |
1088 | 230k | { |
1089 | 230k | if( rEntity.mpParser != nullptr ) |
1090 | 230k | { |
1091 | 230k | if( xmlParseChunk( rEntity.mpParser, reinterpret_cast<const char*>(seqOut.getConstArray()), 0, 1 ) != XML_ERR_OK ) |
1092 | 94.0k | rEntity.throwException( mxDocumentLocator, true ); |
1093 | 230k | if (rEntity.hasException()) |
1094 | 1.03k | rEntity.throwException(mxDocumentLocator, true); |
1095 | 230k | } |
1096 | 230k | break; |
1097 | 230k | } |
1098 | | |
1099 | 291k | bool bContinue = true; |
1100 | 291k | if( rEntity.mpParser == nullptr ) |
1101 | 244k | { |
1102 | | // create parser with proper encoding (needs the first chunk of data) |
1103 | 244k | rEntity.mpParser = xmlCreatePushParserCtxt( &callbacks, this, |
1104 | 244k | reinterpret_cast<const char*>(seqOut.getConstArray()), nRead, nullptr ); |
1105 | 244k | if( !rEntity.mpParser ) |
1106 | 0 | throw SAXException(u"Couldn't create parser"_ustr, Reference< XInterface >(), Any() ); |
1107 | | |
1108 | | // Tell libxml2 parser to decode entities in attribute values. |
1109 | | // Also allow XML attribute values which are larger than 10MB, because this used to work |
1110 | | // with expat. |
1111 | | // coverity[unsafe_xml_parse_config] - entity support is required |
1112 | 244k | xmlCtxtUseOptions(rEntity.mpParser, XML_PARSE_NOENT | XML_PARSE_HUGE); |
1113 | 244k | } |
1114 | 46.9k | else |
1115 | 46.9k | { |
1116 | 46.9k | bContinue = xmlParseChunk( rEntity.mpParser, reinterpret_cast<const char*>(seqOut.getConstArray()), nRead, 0 ) |
1117 | 46.9k | == XML_ERR_OK; |
1118 | 46.9k | } |
1119 | | |
1120 | | // callbacks used inside XML_Parse may have caught an exception |
1121 | 291k | if (!bContinue) |
1122 | 13.3k | { |
1123 | 13.3k | rEntity.throwException( mxDocumentLocator, true ); |
1124 | 13.3k | } |
1125 | 291k | if (rEntity.hasException()) |
1126 | 583 | { |
1127 | 583 | rEntity.throwException( mxDocumentLocator, true ); |
1128 | 583 | } |
1129 | 291k | } while( nRead > 0 ); |
1130 | 250k | rEntity.getEvent( CallbackType::DONE ); |
1131 | 250k | if( rEntity.mbEnableThreads ) |
1132 | 0 | produce( true ); |
1133 | 250k | } |
1134 | | |
1135 | | // The C-Callbacks |
/** Start-element callback (libxml2 startElementNs signature).

    Flushes buffered character data, records the namespace declarations made
    on this element, builds the attribute list for the event, resolves the
    element token (or falls back to namespace + qualified name) and then
    either queues the event (threaded mode) or dispatches it directly.

    @param localName      element name without prefix
    @param prefix         element prefix, may be nullptr
    @param URI            namespace URI of the element, may be nullptr
    @param numNamespaces  number of (prefix, URI) pairs in namespaces
    @param namespaces     flat array of (prefix, URI) pairs; a nullptr prefix
                          denotes the default namespace
    @param numAttributes  number of attributes
    @param attributes     flat array with five entries per attribute:
                          localname / prefix / nsURI / valueBegin / valueEnd

    Exceptions raised while processing are caught and saved on the entity.
*/
void FastSaxParserImpl::callbackStartElement(const xmlChar *localName , const xmlChar* prefix, const xmlChar* URI,
    int numNamespaces, const xmlChar** namespaces, int numAttributes, const xmlChar **attributes)
{
    if (!pendingCharacters.empty())
        sendPendingCharacters();
    Entity& rEntity = getEntity();
    // every element gets its own entry on maNamespaceCount: the root starts
    // at 0 (plus the implicit "xml" prefix), children inherit the parent's count
    if( rEntity.maNamespaceCount.empty() )
    {
        rEntity.maNamespaceCount.push(0);
        DefineNamespace( "xml"_ostr, u"http://www.w3.org/XML/1998/namespace"_ustr);
    }
    else
    {
        rEntity.maNamespaceCount.push( rEntity.maNamespaceCount.top() );
    }

    // create attribute map and process namespace instructions
    Event& rEvent = rEntity.getEvent( CallbackType::START_ELEMENT );
    // in threaded mode the consumer may already have cleared the attribute
    // lists of this event list; then clearing again can be skipped
    bool bIsAttributesEmpty = false;
    if ( rEntity.mbEnableThreads )
        bIsAttributesEmpty = rEntity.getEventList().mbIsAttributesEmpty;

    if (rEvent.mxAttributes.is())
    {
        if( !bIsAttributesEmpty )
            rEvent.mxAttributes->clear();
    }
    else
        rEvent.mxAttributes.set(
                new FastAttributeList( rEntity.mxTokenHandler.get() ) );

    // namespace declarations are only collected when a namespace handler is set
    if( rEntity.mxNamespaceHandler.is() )
    {
        if (rEvent.mxDeclAttributes.is())
        {
            if( !bIsAttributesEmpty )
                rEvent.mxDeclAttributes->clear();
        }
        else
            rEvent.mxDeclAttributes.set(
                new FastAttributeList( rEntity.mxTokenHandler.get() ) );
    }

    // start from the default namespace of the enclosing element, if any
    OUString sNamespace;
    sal_Int32 nNamespaceToken = FastToken::DONTKNOW;
    if (!rEntity.maNamespaceStack.empty())
    {
        sNamespace = rEntity.maNamespaceStack.top().msName;
        nNamespaceToken = rEntity.maNamespaceStack.top().mnToken;
    }

    try
    {
        /*  #158414# Each element may define new namespaces, also for attributes.
            First, process all namespaces, second, process the attributes after namespaces
            have been initialized. */

        std::string_view sPrefix; // convert to string_view so we only do strlen() once.
        if (prefix != nullptr)
            sPrefix = XML_CAST(prefix);
        // #158414# first: get namespaces
        for (int i = 0; i < numNamespaces * 2; i += 2)
        {
            // namespaces[] is (prefix/URI)
            if( namespaces[ i ] != nullptr )
            {
                OString aPrefix( XML_CAST( namespaces[ i ] ));
                OUString namespaceURL( XML_CAST( namespaces[ i + 1 ] ), strlen( XML_CAST( namespaces[ i + 1 ] )), RTL_TEXTENCODING_UTF8 );
                NormalizeURI( namespaceURL );
                DefineNamespace(aPrefix, namespaceURL);
                if( rEntity.mxNamespaceHandler.is() )
                    rEvent.mxDeclAttributes->addUnknown( OString( XML_CAST( namespaces[ i ] ) ), OString( XML_CAST( namespaces[ i + 1 ] ) ) );
            }
            else
            {
                // default namespace
                sNamespace = OUString( XML_CAST( namespaces[ i + 1 ] ), strlen( XML_CAST( namespaces[ i + 1 ] )), RTL_TEXTENCODING_UTF8 );
                NormalizeURI( sNamespace );
                nNamespaceToken = GetNamespaceToken( sNamespace );
                if( rEntity.mxNamespaceHandler.is() )
                    rEvent.mxDeclAttributes->addUnknown( ""_ostr, OString( XML_CAST( namespaces[ i + 1 ] ) ) );
            }
        }

        if ( rEntity.mxTokenHandler.is() )
        {
            // #158414# second: fill attribute list with other attributes
            rEvent.mxAttributes->reserve( numAttributes );
            for (int i = 0; i < numAttributes * 5; i += 5)
            {
                // attributes[] is ( localname / prefix / nsURI / valueBegin / valueEnd )
                if( attributes[ i + 1 ] != nullptr )
                {
                    // prefixed attribute: token is resolved through the prefix
                    sal_Int32 nAttributeToken = GetTokenWithPrefix(XML_CAST(attributes[ i + 1 ]), attributes[ i ]);
                    if( nAttributeToken != FastToken::DONTKNOW )
                        rEvent.mxAttributes->add( nAttributeToken, std::string_view(XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ]) );
                    else
                        addUnknownElementWithPrefix(attributes, i, rEvent.mxAttributes);
                }
                else
                {
                    // unprefixed attribute: token is resolved from the local name alone
                    sal_Int32 nAttributeToken = GetToken(attributes[ i ]);
                    if( nAttributeToken != FastToken::DONTKNOW )
                        rEvent.mxAttributes->add( nAttributeToken, std::string_view(XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ]) );
                    else
                    {
                        SAL_WARN("xmloff", "unknown attribute " << XML_CAST( attributes[ i ] ) << "=" <<
                            OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
                        rEvent.mxAttributes->addUnknown( XML_CAST( attributes[ i ] ),
                            OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
                    }
                }
            }

            // element token: explicit prefix first, then the default namespace
            // in effect, otherwise the bare local name
            if( !sPrefix.empty() )
                rEvent.mnElementToken = GetTokenWithPrefix(sPrefix, localName);
            else if( !sNamespace.isEmpty() )
                rEvent.mnElementToken = GetTokenWithContextNamespace(nNamespaceToken, localName);
            else
                rEvent.mnElementToken = GetToken(localName);
        }
        else
        {
            // no token handler: every attribute is stored as unknown
            for (int i = 0; i < numAttributes * 5; i += 5)
            {
                if( attributes[ i + 1 ] != nullptr )
                    addUnknownElementWithPrefix(attributes, i, rEvent.mxAttributes);
                else
                    rEvent.mxAttributes->addUnknown( XML_CAST( attributes[ i ] ),
                            OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
            }

            rEvent.mnElementToken = FastToken::DONTKNOW;
        }

        if( rEvent.mnElementToken == FastToken::DONTKNOW )
        {
            // unknown element: report it by namespace and (qualified) name instead
            OUString aElementPrefix;
            if( !sPrefix.empty() )
            {
                aElementPrefix = OUString( sPrefix.data(), sPrefix.size(), RTL_TEXTENCODING_UTF8 );
                if ( URI != nullptr )
                    sNamespace = OUString( XML_CAST( URI ), strlen( XML_CAST( URI )), RTL_TEXTENCODING_UTF8 );
                else if ( m_bIgnoreMissingNSDecl )
                    sNamespace.clear();
                else
                    throw SAXException("No namespace defined for " + aElementPrefix, {}, {});
                nNamespaceToken = GetNamespaceToken( sNamespace );
            }
            OUString aElementLocalName( XML_CAST( localName ), strlen( XML_CAST( localName )), RTL_TEXTENCODING_UTF8 );
            rEvent.msNamespace = sNamespace;
            if( aElementPrefix.isEmpty() )
                rEvent.msElementName = std::move(aElementLocalName);
            else
                rEvent.msElementName = aElementPrefix + ":" + aElementLocalName;
        }
        else // token is always preferred.
            rEvent.msElementName.clear();

        // remember the namespace in effect for the children of this element
        rEntity.maNamespaceStack.push( NameWithToken(sNamespace, nNamespaceToken) );
        if (rEntity.mbEnableThreads)
            produce();
        else
        {
            SAL_INFO("sax.fastparser", " startElement line " << mxDocumentLocator->getLineNumber() << " column " << mxDocumentLocator->getColumnNumber() << " " << ( prefix ? XML_CAST(prefix) : "(null)" ) << ":" << localName);
            rEntity.startElement( &rEvent );
        }
    }
    catch (...)
    {
        // save the exception on the entity rather than letting it propagate
        // out of this callback
        rEntity.saveException( ::cppu::getCaughtException() );
    }
}
1309 | | |
1310 | | void FastSaxParserImpl::addUnknownElementWithPrefix(const xmlChar **attributes, int i, rtl::Reference< FastAttributeList > const & xAttributes) |
1311 | 1.41M | { |
1312 | 1.41M | OUString aNamespaceURI; |
1313 | 1.41M | if ( !m_bIgnoreMissingNSDecl || attributes[i + 2] != nullptr ) |
1314 | 761k | aNamespaceURI = OUString( XML_CAST( attributes[ i + 2 ] ), strlen( XML_CAST( attributes[ i + 2 ] )), RTL_TEXTENCODING_UTF8 ); |
1315 | 1.41M | const OString aPrefix( XML_CAST( attributes[ i + 1 ] )); |
1316 | 1.41M | const OString aLocalName( XML_CAST( attributes[ i ] )); |
1317 | 1.41M | OString aQualifiedName = (aPrefix.isEmpty())? aLocalName : aPrefix + ":" + aLocalName; |
1318 | 1.41M | xAttributes->addUnknown( aNamespaceURI, aQualifiedName, |
1319 | 1.41M | OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] )); |
1320 | 1.41M | SAL_INFO("xmloff", "unknown element " << aQualifiedName << " " << aNamespaceURI); |
1321 | 1.41M | } |
1322 | | |
1323 | | void FastSaxParserImpl::callbackEndElement() |
1324 | 17.4M | { |
1325 | 17.4M | if (!pendingCharacters.empty()) |
1326 | 3.27M | sendPendingCharacters(); |
1327 | 17.4M | Entity& rEntity = getEntity(); |
1328 | 17.4M | SAL_WARN_IF(rEntity.maNamespaceCount.empty(), "sax", "Empty NamespaceCount"); |
1329 | 17.4M | if( !rEntity.maNamespaceCount.empty() ) |
1330 | 17.4M | rEntity.maNamespaceCount.pop(); |
1331 | | |
1332 | 17.4M | SAL_WARN_IF(rEntity.maNamespaceStack.empty(), "sax", "Empty NamespaceStack"); |
1333 | 17.4M | if( !rEntity.maNamespaceStack.empty() ) |
1334 | 17.4M | rEntity.maNamespaceStack.pop(); |
1335 | | |
1336 | 17.4M | rEntity.getEvent( CallbackType::END_ELEMENT ); |
1337 | 17.4M | if (rEntity.mbEnableThreads) |
1338 | 0 | produce(); |
1339 | 17.4M | else |
1340 | 17.4M | rEntity.endElement(); |
1341 | 17.4M | } |
1342 | | |
1343 | | void FastSaxParserImpl::callbackCharacters( const xmlChar* s, int nLen ) |
1344 | 14.4M | { |
1345 | | // SAX interface allows that the characters callback splits content of one XML node |
1346 | | // (e.g. because there's an entity that needs decoding), however for consumers it's |
1347 | | // simpler FastSaxParser's character callback provides the whole string at once, |
1348 | | // so merge data from possible multiple calls and send them at once (before the element |
1349 | | // ends or another one starts). |
1350 | | // |
1351 | | // We use a std::vector<char> to avoid calling into the OUString constructor more than once when |
1352 | | // we have multiple callbackCharacters() calls that we have to merge, which happens surprisingly |
1353 | | // often in writer documents. |
1354 | 14.4M | int nOriginalLen = pendingCharacters.size(); |
1355 | 14.4M | pendingCharacters.resize(nOriginalLen + nLen); |
1356 | 14.4M | memcpy(pendingCharacters.data() + nOriginalLen, s, nLen); |
1357 | 14.4M | } |
1358 | | |
1359 | | void FastSaxParserImpl::sendPendingCharacters() |
1360 | 12.7M | { |
1361 | 12.7M | Entity& rEntity = getEntity(); |
1362 | 12.7M | OUString sChars( pendingCharacters.data(), pendingCharacters.size(), RTL_TEXTENCODING_UTF8 ); |
1363 | 12.7M | if (rEntity.mbEnableThreads) |
1364 | 0 | { |
1365 | 0 | Event& rEvent = rEntity.getEvent( CallbackType::CHARACTERS ); |
1366 | 0 | rEvent.msChars = std::move(sChars); |
1367 | 0 | produce(); |
1368 | 0 | } |
1369 | 12.7M | else |
1370 | 12.7M | rEntity.characters( sChars ); |
1371 | 12.7M | pendingCharacters.resize(0); |
1372 | 12.7M | } |
1373 | | |
1374 | | void FastSaxParserImpl::callbackProcessingInstruction( const xmlChar *target, const xmlChar *data ) |
1375 | 71.2k | { |
1376 | 71.2k | if (!pendingCharacters.empty()) |
1377 | 8.00k | sendPendingCharacters(); |
1378 | 71.2k | Entity& rEntity = getEntity(); |
1379 | 71.2k | Event& rEvent = rEntity.getEvent( CallbackType::PROCESSING_INSTRUCTION ); |
1380 | | |
1381 | | // This event is very rare, so no need to waste extra space for this |
1382 | | // Using namespace and element strings to be target and data in that order. |
1383 | 71.2k | rEvent.msNamespace = OUString( XML_CAST( target ), strlen( XML_CAST( target ) ), RTL_TEXTENCODING_UTF8 ); |
1384 | 71.2k | if ( data != nullptr ) |
1385 | 61.4k | rEvent.msElementName = OUString( XML_CAST( data ), strlen( XML_CAST( data ) ), RTL_TEXTENCODING_UTF8 ); |
1386 | 9.77k | else |
1387 | 9.77k | rEvent.msElementName.clear(); |
1388 | | |
1389 | 71.2k | if (rEntity.mbEnableThreads) |
1390 | 0 | produce(); |
1391 | 71.2k | else |
1392 | 71.2k | rEntity.processingInstruction( rEvent.msNamespace, rEvent.msElementName ); |
1393 | 71.2k | } |
1394 | | |
1395 | | xmlEntityPtr FastSaxParserImpl::callbackGetEntity( const xmlChar *name ) |
1396 | 70.1k | { |
1397 | 70.1k | if( !name ) |
1398 | 0 | return xmlGetPredefinedEntity(name); |
1399 | 70.1k | const char* dname = XML_CAST(name); |
1400 | 70.1k | int lname = strlen(dname); |
1401 | 70.1k | if( lname == 0 ) |
1402 | 0 | return xmlGetPredefinedEntity(name); |
1403 | 70.1k | if (m_Replacements.size() > 0) |
1404 | 0 | { |
1405 | 0 | auto it = std::lower_bound(m_Replacements.begin(), m_Replacements.end(), dname); |
1406 | 0 | if (it != m_Replacements.end() && it->name.compareToAscii(dname) == 0) |
1407 | 0 | { |
1408 | 0 | xmlEntityPtr entpt = xmlNewEntity( |
1409 | 0 | nullptr, name, XML_INTERNAL_GENERAL_ENTITY, nullptr, nullptr, |
1410 | 0 | BAD_CAST(OUStringToOString(it->replacement, RTL_TEXTENCODING_UTF8).getStr())); |
1411 | 0 | m_TemporalEntities.push_back(entpt); |
1412 | 0 | return entpt; |
1413 | 0 | } |
1414 | 0 | } |
1415 | 70.1k | if( lname < 2 ) |
1416 | 26.0k | return xmlGetPredefinedEntity(name); |
1417 | 44.0k | if ( dname[0] == '#' ) |
1418 | 0 | { |
1419 | 0 | sal_uInt32 cval = 0; |
1420 | 0 | if( dname[1] == 'x' || dname[1] == 'X' ) |
1421 | 0 | { |
1422 | 0 | if( lname < 3 ) |
1423 | 0 | return xmlGetPredefinedEntity(name); |
1424 | 0 | cval = static_cast<sal_uInt32>( strtoul( dname + 2, nullptr, 16 ) ); |
1425 | 0 | if( cval == 0 ) |
1426 | 0 | return xmlGetPredefinedEntity(name); |
1427 | 0 | OUString vname( &cval, 1 ); |
1428 | 0 | xmlEntityPtr entpt |
1429 | 0 | = xmlNewEntity(nullptr, name, XML_INTERNAL_GENERAL_ENTITY, nullptr, nullptr, |
1430 | 0 | BAD_CAST(OUStringToOString(vname, RTL_TEXTENCODING_UTF8).getStr())); |
1431 | 0 | m_TemporalEntities.push_back(entpt); |
1432 | 0 | return entpt; |
1433 | 0 | } |
1434 | 0 | else |
1435 | 0 | { |
1436 | 0 | cval = static_cast<sal_uInt32>( strtoul( dname + 2, nullptr, 10 ) ); |
1437 | 0 | if( cval == 0 ) |
1438 | 0 | return xmlGetPredefinedEntity(name); |
1439 | 0 | OUString vname(&cval, 1); |
1440 | 0 | xmlEntityPtr entpt |
1441 | 0 | = xmlNewEntity(nullptr, name, XML_INTERNAL_GENERAL_ENTITY, nullptr, nullptr, |
1442 | 0 | BAD_CAST(OUStringToOString(vname, RTL_TEXTENCODING_UTF8).getStr())); |
1443 | 0 | m_TemporalEntities.push_back(entpt); |
1444 | 0 | return entpt; |
1445 | 0 | } |
1446 | 0 | } |
1447 | 44.0k | return xmlGetPredefinedEntity(name); |
1448 | 44.0k | } |
1449 | | |
/// Creates the implementation object that all interface methods forward to.
FastSaxParser::FastSaxParser() : mpImpl(new FastSaxParserImpl) {}
1451 | | |
/// Out-of-line destructor with an empty body; members clean up themselves.
FastSaxParser::~FastSaxParser()
{
}
1455 | | |
1456 | | void SAL_CALL |
1457 | | FastSaxParser::initialize(css::uno::Sequence< css::uno::Any > const& rArguments) |
1458 | 159k | { |
1459 | 159k | if (!rArguments.hasElements()) |
1460 | 0 | return; |
1461 | | |
1462 | 159k | OUString str; |
1463 | 159k | if ( !(rArguments[0] >>= str) ) |
1464 | 0 | throw IllegalArgumentException(); |
1465 | | |
1466 | 159k | if ( str == "IgnoreMissingNSDecl" ) |
1467 | 125k | mpImpl->m_bIgnoreMissingNSDecl = true; |
1468 | 33.6k | else if ( str == "DoSmeplease" ) |
1469 | 0 | ; //just ignore as this is already immune to billion laughs |
1470 | 33.6k | else if ( str == "DisableThreadedParser" ) |
1471 | 33.6k | mpImpl->m_bDisableThreadedParser = true; |
1472 | 0 | else |
1473 | 0 | throw IllegalArgumentException(); |
1474 | | |
1475 | 159k | } |
1476 | | |
/// Forwards to FastSaxParserImpl::parseStream().
void FastSaxParser::parseStream( const xml::sax::InputSource& aInputSource )
{
    mpImpl->parseStream(aInputSource);
}
1481 | | |
/// Forwards to FastSaxParserImpl::setFastDocumentHandler().
void FastSaxParser::setFastDocumentHandler( const uno::Reference<xml::sax::XFastDocumentHandler>& Handler )
{
    mpImpl->setFastDocumentHandler(Handler);
}
1486 | | |
/// Forwards to FastSaxParserImpl::setTokenHandler().
void FastSaxParser::setTokenHandler( const uno::Reference<xml::sax::XFastTokenHandler>& Handler )
{
    mpImpl->setTokenHandler(Handler);
}
1491 | | |
/// Forwards to FastSaxParserImpl::registerNamespace().
void FastSaxParser::registerNamespace( const OUString& NamespaceURL, sal_Int32 NamespaceToken )
{
    mpImpl->registerNamespace(NamespaceURL, NamespaceToken);
}
1496 | | |
/// Forwards to FastSaxParserImpl::getNamespaceURL() and returns a copy of the URL.
OUString FastSaxParser::getNamespaceURL( const OUString& rPrefix )
{
    return mpImpl->getNamespaceURL(rPrefix);
}
1501 | | |
/// Forwards to FastSaxParserImpl::setErrorHandler().
void FastSaxParser::setErrorHandler( const uno::Reference< xml::sax::XErrorHandler >& Handler )
{
    mpImpl->setErrorHandler(Handler);
}
1506 | | |
/// Accepted for interface compatibility; the resolver is ignored.
void FastSaxParser::setEntityResolver( const uno::Reference< xml::sax::XEntityResolver >& )
{
    // not implemented
}
1511 | | |
/// Accepted for interface compatibility; the locale is ignored.
void FastSaxParser::setLocale( const lang::Locale& )
{
    // not implemented
}
1516 | | |
/// Forwards to FastSaxParserImpl::setNamespaceHandler().
void FastSaxParser::setNamespaceHandler( const uno::Reference< css::xml::sax::XFastNamespaceHandler >& Handler)
{
    mpImpl->setNamespaceHandler(Handler);
}
1521 | | |
// Returns the fixed implementation name of this component.
OUString FastSaxParser::getImplementationName()
{
    return u"com.sun.star.comp.extensions.xml.sax.FastParser"_ustr;
}
1526 | | |
// Sets custom entity names from a sequence of string pairs.
// Pure forwarder: the sequence is passed unchanged to the implementation
// object; how each pair is interpreted is decided there.
void FastSaxParser::setCustomEntityNames(
    const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& replacements)
{
    mpImpl->setCustomEntityNames(replacements);
}
1532 | | |
// Reports whether this component supports the named service; the decision
// is delegated to the cppu::supportsService helper.
sal_Bool FastSaxParser::supportsService( const OUString& ServiceName )
{
    return cppu::supportsService(this, ServiceName);
}
1537 | | |
// Returns the service names of this component: a one-element sequence
// containing "com.sun.star.xml.sax.FastParser".
uno::Sequence<OUString> FastSaxParser::getSupportedServiceNames()
{
    return { u"com.sun.star.xml.sax.FastParser"_ustr };
}
1542 | | |
1543 | | } // namespace sax_fastparser |
1544 | | |
// Exported C-linkage factory for the FastParser component.
// Allocates a new FastSaxParser and returns the result of cppu::acquire()
// on it. Both parameters (component context and argument sequence) are
// unnamed and unused.
extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
com_sun_star_comp_extensions_xml_sax_FastParser_get_implementation(
    css::uno::XComponentContext *,
    css::uno::Sequence<css::uno::Any> const &)
{
    return cppu::acquire(new FastSaxParser);
}
1552 | | |
1553 | | // ---------------------------------------------------------- |
1554 | | // copy of the code in xmloff/source/core/namespace.cxx, which adds namespace aliases |
1555 | | // for various dodgy namespace decls in the wild. |
1556 | | |
// Forward declarations: NormalizeURI() calls both helpers before their
// definitions appear further down in this file.
static bool NormalizeW3URI( OUString& rName );
static bool NormalizeOasisURN( OUString& rName );
1559 | | |
1560 | | static void NormalizeURI( OUString& rName ) |
1561 | 1.65M | { |
1562 | | // try OASIS + W3 URI normalization |
1563 | 1.65M | bool bSuccess = NormalizeOasisURN( rName ); |
1564 | 1.65M | if( ! bSuccess ) |
1565 | 1.34M | NormalizeW3URI( rName ); |
1566 | 1.65M | } |
1567 | | |
// String constants used by NormalizeW3URI() and NormalizeOasisURN().

// Prefix / suffix that identify a W3 XForms URI, and its normalized form.
constexpr OUStringLiteral XML_URI_W3_PREFIX(u"http://www.w3.org/");
constexpr OUStringLiteral XML_URI_XFORMS_SUFFIX(u"/xforms");
constexpr OUStringLiteral XML_N_XFORMS_1_0(u"http://www.w3.org/2002/xforms");
// W3 namespace names and the "-compatible" OASIS URNs they are mapped to.
constexpr OUStringLiteral XML_N_SVG(u"http://www.w3.org/2000/svg");
constexpr OUStringLiteral XML_N_SVG_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0");
constexpr OUStringLiteral XML_N_FO(u"http://www.w3.org/1999/XSL/Format");
constexpr OUStringLiteral XML_N_FO_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0");
// Two spellings of the SMIL namespace (with and without trailing slash);
// both are mapped to XML_N_SMIL_COMPAT.
constexpr OUStringLiteral XML_N_SMIL(u"http://www.w3.org/2001/SMIL20/");
constexpr OUStringLiteral XML_N_SMIL_OLD(u"http://www.w3.org/2001/SMIL20");
constexpr OUStringLiteral XML_N_SMIL_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:smil-compatible:1.0");
// Building blocks for matching and rewriting generic OASIS URNs.
constexpr OUStringLiteral XML_URN_OASIS_NAMES_TC(u"urn:oasis:names:tc");
constexpr OUStringLiteral XML_XMLNS(u"xmlns");
constexpr OUStringLiteral XML_OPENDOCUMENT(u"opendocument");
constexpr OUStringLiteral XML_1_0(u"1.0");
1582 | | |
1583 | | static bool NormalizeW3URI( OUString& rName ) |
1584 | 1.34M | { |
1585 | | // check if URI matches: |
1586 | | // http://www.w3.org/[0-9]*/[:letter:]* |
1587 | | // (year)/(WG name) |
1588 | | // For the following WG/standards names: |
1589 | | // - xforms |
1590 | | |
1591 | 1.34M | bool bSuccess = false; |
1592 | 1.34M | const OUString sURIPrefix = XML_URI_W3_PREFIX; |
1593 | 1.34M | if( rName.startsWith( sURIPrefix ) ) |
1594 | 132k | { |
1595 | 132k | const OUString sURISuffix = XML_URI_XFORMS_SUFFIX ; |
1596 | 132k | sal_Int32 nCompareFrom = rName.getLength() - sURISuffix.getLength(); |
1597 | 132k | if( rName.subView( nCompareFrom ) == sURISuffix ) |
1598 | 10.7k | { |
1599 | | // found W3 prefix, and xforms suffix |
1600 | 10.7k | rName = XML_N_XFORMS_1_0; |
1601 | 10.7k | bSuccess = true; |
1602 | 10.7k | } |
1603 | 132k | } |
1604 | 1.34M | return bSuccess; |
1605 | 1.34M | } |
1606 | | |
1607 | | static bool NormalizeOasisURN( OUString& rName ) |
1608 | 1.65M | { |
1609 | | // #i38644# |
1610 | | // we exported the wrong namespace for smil, so we correct this here on load |
1611 | | // for older documents |
1612 | 1.65M | if( rName == XML_N_SVG ) |
1613 | 463 | { |
1614 | 463 | rName = XML_N_SVG_COMPAT; |
1615 | 463 | return true; |
1616 | 463 | } |
1617 | 1.65M | else if( rName == XML_N_FO ) |
1618 | 0 | { |
1619 | 0 | rName = XML_N_FO_COMPAT; |
1620 | 0 | return true; |
1621 | 0 | } |
1622 | 1.65M | else if( rName == XML_N_SMIL || rName == XML_N_SMIL_OLD ) |
1623 | 0 | { |
1624 | 0 | rName = XML_N_SMIL_COMPAT; |
1625 | 0 | return true; |
1626 | 0 | } |
1627 | | |
1628 | | |
1629 | | // Check if URN matches |
1630 | | // :urn:oasis:names:tc:[^:]*:xmlns:[^:]*:1.[^:]* |
1631 | | // |---| |---| |-----| |
1632 | | // TC-Id Sub-Id Version |
1633 | | |
1634 | 1.65M | sal_Int32 nNameLen = rName.getLength(); |
1635 | | // :urn:oasis:names:tc.* |
1636 | 1.65M | const OUString aOasisURN = XML_URN_OASIS_NAMES_TC; |
1637 | 1.65M | if( !rName.startsWith( aOasisURN ) ) |
1638 | 1.27M | return false; |
1639 | | |
1640 | | // :urn:oasis:names:tc:.* |
1641 | 388k | sal_Int32 nPos = aOasisURN.getLength(); |
1642 | 388k | if( nPos >= nNameLen || rName[nPos] != ':' ) |
1643 | 5.79k | return false; |
1644 | | |
1645 | | // :urn:oasis:names:tc:[^:]:.* |
1646 | 382k | sal_Int32 nTCIdStart = nPos+1; |
1647 | 382k | sal_Int32 nTCIdEnd = rName.indexOf( ':', nTCIdStart ); |
1648 | 382k | if( -1 == nTCIdEnd ) |
1649 | 2.64k | return false; |
1650 | | |
1651 | | // :urn:oasis:names:tc:[^:]:xmlns.* |
1652 | 379k | nPos = nTCIdEnd + 1; |
1653 | 379k | std::u16string_view sTmp( rName.subView( nPos ) ); |
1654 | 379k | const OUString aXMLNS = XML_XMLNS; |
1655 | 379k | if( !o3tl::starts_with(sTmp, aXMLNS ) ) |
1656 | 33.4k | return false; |
1657 | | |
1658 | | // :urn:oasis:names:tc:[^:]:xmlns:.* |
1659 | 346k | nPos += aXMLNS.getLength(); |
1660 | 346k | if( nPos >= nNameLen || rName[nPos] != ':' ) |
1661 | 3.17k | return false; |
1662 | | |
1663 | | // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:.* |
1664 | 343k | nPos = rName.indexOf( ':', nPos+1 ); |
1665 | 343k | if( -1 == nPos ) |
1666 | 2.35k | return false; |
1667 | | |
1668 | | // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:[^:][^:][^:][^:]* |
1669 | 340k | sal_Int32 nVersionStart = nPos+1; |
1670 | 340k | if( nVersionStart+2 >= nNameLen || |
1671 | 340k | -1 != rName.indexOf( ':', nVersionStart ) ) |
1672 | 5.42k | return false; |
1673 | | |
1674 | | // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:1\.[^:][^:]* |
1675 | 335k | if( rName[nVersionStart] != '1' || rName[nVersionStart+1] != '.' ) |
1676 | 19.8k | return false; |
1677 | | |
1678 | | // replace [tcid] with current TCID and version with current version. |
1679 | | |
1680 | 315k | rName = rName.subView( 0, nTCIdStart ) + |
1681 | 315k | XML_OPENDOCUMENT + |
1682 | 315k | rName.subView( nTCIdEnd, nVersionStart-nTCIdEnd ) + |
1683 | 315k | XML_1_0; |
1684 | | |
1685 | 315k | return true; |
1686 | 335k | } |
1687 | | |
1688 | | |
1689 | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |