/src/poco/XML/src/ParserEngine.h
Line | Count | Source |
1 | | // |
2 | | // ParserEngine.h |
3 | | // |
4 | | // Library: XML |
5 | | // Package: XML |
6 | | // Module: ParserEngine |
7 | | // |
8 | | // Definition of the ParserEngine class. |
9 | | // |
10 | | // Copyright (c) 2004-2006, Applied Informatics Software Engineering GmbH. |
11 | | // and Contributors. |
12 | | // |
13 | | // SPDX-License-Identifier: BSL-1.0 |
14 | | |
15 | | |
16 | | #ifndef XML_ParserEngine_INCLUDED |
17 | | #define XML_ParserEngine_INCLUDED |
18 | | |
19 | | |
20 | | #include "Poco/XML/XML.h" |
21 | | #include "Poco/XML/XMLString.h" |
22 | | #include "Poco/XML/XMLStream.h" |
23 | | #include "Poco/SAX/Locator.h" |
24 | | #include "Poco/TextEncoding.h" |
25 | | #include <expat.h> |
26 | | #include <map> |
27 | | #include <vector> |
28 | | |
29 | | |
30 | | namespace Poco { |
31 | | namespace XML { |
32 | | |
33 | | |
34 | | class InputSource; |
35 | | class EntityResolver; |
36 | | class DTDHandler; |
37 | | class DeclHandler; |
38 | | class ContentHandler; |
39 | | class LexicalHandler; |
40 | | class ErrorHandler; |
41 | | class NamespaceStrategy; |
42 | | class ContextLocator; |
43 | | |
44 | | |
45 | | class XML_API ParserEngine: public Locator |
46 | | /// This class provides an object-oriented, stream-based, |
47 | | /// low-level interface to the XML Parser Toolkit (expat). |
48 | | /// It is strongly recommended that you use the |
49 | | /// SAX parser classes (which are based on this |
50 | | /// class) instead of this class, since they provide |
51 | | /// a standardized, higher-level interface to the parser. |
52 | | { |
53 | | public: |
54 | | ParserEngine(); |
55 | | /// Creates the parser engine. |
56 | | |
57 | | ParserEngine(const XMLString& encoding); |
58 | | /// Creates the parser engine and passes the encoding |
59 | | /// to the underlying parser. |
60 | | |
61 | | ~ParserEngine(); |
62 | | /// Destroys the parser. |
63 | | |
64 | | void setEncoding(const XMLString& encoding); |
65 | | /// Sets the encoding used by expat. The encoding must be |
66 | | /// set before parsing begins, otherwise it will be ignored. |
67 | | |
68 | | const XMLString& getEncoding() const; |
69 | | /// Returns the encoding used by expat. |
70 | | |
71 | | void addEncoding(const XMLString& name, Poco::TextEncoding* pEncoding); |
72 | | /// Adds an encoding to the parser. |
73 | | |
74 | | void setNamespaceStrategy(NamespaceStrategy* pStrategy); |
75 | | /// Sets the NamespaceStrategy used by the parser. |
76 | | /// The parser takes ownership of the strategy object |
77 | | /// and deletes it when it's no longer needed. |
78 | | /// The default is NoNamespacesStrategy. |
79 | | |
80 | | NamespaceStrategy* getNamespaceStrategy() const; |
81 | | /// Returns the NamespaceStrategy currently in use. |
82 | | |
83 | | void setExpandInternalEntities(bool flag = true); |
84 | | /// Enables/disables expansion of internal entities (enabled by |
85 | | /// default). If entity expansion is disabled, internal entities |
86 | | /// are reported via the default handler. |
87 | | /// Must be set before parsing begins, otherwise it will be |
88 | | /// ignored. |
89 | | |
90 | | bool getExpandInternalEntities() const; |
91 | | /// Returns true if internal entities will be expanded automatically, |
92 | | /// which is the default. |
93 | | |
94 | | void setExternalGeneralEntities(bool flag = true); |
95 | | /// Enable or disable processing of external general entities. |
96 | | |
97 | | bool getExternalGeneralEntities() const; |
98 | | /// Returns true if external general entities will be processed; false otherwise. |
99 | | |
100 | | void setExternalParameterEntities(bool flag = true); |
101 | | /// Enable or disable processing of external parameter entities. |
102 | | |
103 | | bool getExternalParameterEntities() const; |
104 | | /// Returns true if external parameter entities will be processed; false otherwise. |
105 | | |
106 | | void setEntityResolver(EntityResolver* pResolver); |
107 | | /// Allow an application to register an entity resolver. |
108 | | |
109 | | EntityResolver* getEntityResolver() const; |
110 | | /// Return the current entity resolver. |
111 | | |
112 | | void setDTDHandler(DTDHandler* pDTDHandler); |
113 | | /// Allow an application to register a DTD event handler. |
114 | | |
115 | | DTDHandler* getDTDHandler() const; |
116 | | /// Return the current DTD handler. |
117 | | |
118 | | void setDeclHandler(DeclHandler* pDeclHandler); |
119 | | /// Allow an application to register a DTD declarations event handler. |
120 | | |
121 | | DeclHandler* getDeclHandler() const; |
122 | | /// Return the current DTD declarations handler. |
123 | | |
124 | | void setContentHandler(ContentHandler* pContentHandler); |
125 | | /// Allow an application to register a content event handler. |
126 | | |
127 | | ContentHandler* getContentHandler() const; |
128 | | /// Return the current content handler. |
129 | | |
130 | | void setLexicalHandler(LexicalHandler* pLexicalHandler); |
131 | | /// Allow an application to register a lexical event handler. |
132 | | |
133 | | LexicalHandler* getLexicalHandler() const; |
134 | | /// Return the current lexical handler. |
135 | | |
136 | | void setErrorHandler(ErrorHandler* pErrorHandler); |
137 | | /// Allow an application to register an error event handler. |
138 | | |
139 | | ErrorHandler* getErrorHandler() const; |
140 | | /// Return the current error handler. |
141 | | |
142 | | void setEnablePartialReads(bool flag = true); |
143 | | /// Enable or disable partial reads from the input source. |
144 | | /// |
145 | | /// This is useful for parsing XML from a socket stream for |
146 | | /// a protocol like XMPP, where basically single elements |
147 | | /// are read one at a time from the input source's stream, and |
148 | | /// following elements depend upon responses sent back to |
149 | | /// the peer. |
150 | | /// |
151 | | /// Normally, the parser always reads blocks of PARSE_BUFFER_SIZE |
152 | | /// at a time, and blocks until a complete block has been read (or |
153 | | /// the end of the stream has been reached). |
154 | | /// This allows for efficient parsing of "complete" XML documents, |
155 | | /// but fails in a case such as XMPP, where only XML fragments |
156 | | /// are sent at a time. |
157 | | |
158 | | bool getEnablePartialReads() const; |
159 | | /// Returns true if partial reads are enabled (see |
160 | | /// setEnablePartialReads()), false otherwise. |
161 | | |
162 | | void setBillionLaughsAttackProtectionMaximumAmplification(float maximumAmplificationFactor); |
163 | | /// Sets the maximum tolerated amplification factor |
164 | | /// for protection against Billion Laughs Attacks. |
165 | | /// |
166 | | /// The amplification factor is calculated as: |
167 | | /// amplification := (direct + indirect) / direct |
168 | | /// while parsing, where: |
169 | | /// - direct is the number of bytes read from the primary document in parsing and |
170 | | /// - indirect is the number of bytes added by expanding entities and reading of |
171 | | /// external DTD files, combined. |
172 | | /// |
173 | | /// maximumAmplificationFactor must be non-NaN and greater than or equal to 1.0. |
174 | | /// |
175 | | /// Requires an underlying Expat version >= 2.4.0. |
176 | | |
177 | | void setBillionLaughsAttackProtectionActivationThreshold(Poco::UInt64 activationThresholdBytes); |
178 | | /// Sets the number of output bytes (including amplification from entity expansion and reading DTD files) |
179 | | /// needed to activate protection against Billion Laughs Attacks. |
180 | | /// |
181 | | /// Defaults to 8 MiB. |
182 | | /// |
183 | | /// Requires an underlying Expat version >= 2.4.0. |
184 | | |
185 | | void parse(InputSource* pInputSource); |
186 | | /// Parses an XML document from the given InputSource. |
187 | | |
188 | | void parse(const char* pBuffer, std::size_t size); |
189 | | /// Parses an XML document from the given buffer. |
190 | | |
191 | | // Locator: identifiers and position of the current document event |
192 | | XMLString getPublicId() const; |
193 | | /// Return the public identifier for the current document event. |
194 | | |
195 | | XMLString getSystemId() const; |
196 | | /// Return the system identifier for the current document event. |
197 | | |
198 | | int getLineNumber() const; |
199 | | /// Return the line number where the current document event ends. |
200 | | |
201 | | int getColumnNumber() const; |
202 | | /// Return the column number where the current document event ends. |
203 | | |
204 | | protected: |
205 | | void init(); |
206 | | /// Initializes expat. |
207 | | |
208 | | void parseByteInputStream(XMLByteInputStream& istr); |
209 | | /// Parses an entity from the given stream. |
210 | | |
211 | | void parseCharInputStream(XMLCharInputStream& istr); |
212 | | /// Parses an entity from the given stream. |
213 | | |
214 | | std::streamsize readBytes(XMLByteInputStream& istr, char* pBuffer, std::streamsize bufferSize); |
215 | | /// Reads at most bufferSize bytes from the given stream into the given buffer. |
216 | | |
217 | | std::streamsize readChars(XMLCharInputStream& istr, XMLChar* pBuffer, std::streamsize bufferSize); |
218 | | /// Reads at most bufferSize chars from the given stream into the given buffer. |
219 | | |
220 | | void handleError(int errorNo); |
221 | | /// Throws an XMLException with a message corresponding |
222 | | /// to the given Expat error code. |
223 | | |
224 | | void parseExternal(XML_Parser extParser, InputSource* pInputSource); |
225 | | /// Parses an external entity from the given InputSource, with a separate parser. |
226 | | |
227 | | void parseExternalByteInputStream(XML_Parser extParser, XMLByteInputStream& istr); |
228 | | /// Parses an external entity from the given stream, with a separate parser. |
229 | | |
230 | | void parseExternalCharInputStream(XML_Parser extParser, XMLCharInputStream& istr); |
231 | | /// Parses an external entity from the given stream, with a separate parser. |
232 | | |
233 | | void pushContext(XML_Parser parser, InputSource* pInputSource); |
234 | | /// Pushes a new entry to the context stack. |
235 | | |
236 | | void popContext(); |
237 | | /// Pops the top-most entry from the context stack. |
238 | | |
239 | | void resetContext(); |
240 | | /// Resets and clears the context stack. |
241 | | |
242 | | const Locator& locator() const; |
243 | | /// Returns a locator denoting the current parse location. |
244 | | |
245 | | // expat handler procedures |
246 | | static void handleStartElement(void* userData, const XML_Char* name, const XML_Char** atts); |
247 | | static void handleEndElement(void* userData, const XML_Char* name); |
248 | | static void handleCharacterData(void* userData, const XML_Char* s, int len); |
249 | | static void handleProcessingInstruction(void* userData, const XML_Char* target, const XML_Char* data); |
250 | | static void handleDefault(void* userData, const XML_Char* s, int len); |
251 | | static void handleUnparsedEntityDecl(void* userData, const XML_Char* entityName, const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId, const XML_Char* notationName); |
252 | | static void handleNotationDecl(void* userData, const XML_Char* notationName, const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId); |
253 | | static int handleExternalEntityRef(XML_Parser parser, const XML_Char* openEntityNames, const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId); |
254 | | static int handleUnknownEncoding(void* encodingHandlerData, const XML_Char* name, XML_Encoding* info); |
255 | | static void handleComment(void* userData, const XML_Char* data); |
256 | | static void handleStartCdataSection(void* userData); |
257 | | static void handleEndCdataSection(void* userData); |
258 | | static void handleStartNamespaceDecl(void* userData, const XML_Char* prefix, const XML_Char* uri); |
259 | | static void handleEndNamespaceDecl(void* userData, const XML_Char* prefix); |
260 | | static void handleStartDoctypeDecl(void* userData, const XML_Char* doctypeName, const XML_Char *systemId, const XML_Char* publicId, int hasInternalSubset); |
261 | | static void handleEndDoctypeDecl(void* userData); |
262 | | static void handleEntityDecl(void *userData, const XML_Char *entityName, int isParamEntity, const XML_Char *value, int valueLength, |
263 | | const XML_Char *base, const XML_Char *systemId, const XML_Char *publicId, const XML_Char *notationName); |
264 | | static void handleExternalParsedEntityDecl(void* userData, const XML_Char* entityName, const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId); |
265 | | static void handleInternalParsedEntityDecl(void* userData, const XML_Char* entityName, const XML_Char* replacementText, int replacementTextLength); |
266 | | static void handleSkippedEntity(void* userData, const XML_Char* entityName, int isParameterEntity); |
267 | | |
268 | | // encoding support |
269 | | static int convert(void *data, const char *s); |
270 | | |
271 | | private: |
272 | | typedef std::map<XMLString, Poco::TextEncoding*> EncodingMap; |
273 | | typedef std::vector<ContextLocator*> ContextStack; |
274 | | |
275 | | XML_Parser _parser; |
276 | | char* _pBuffer; |
277 | | bool _encodingSpecified; |
278 | | XMLString _encoding; |
279 | | bool _expandInternalEntities; |
280 | | bool _externalGeneralEntities; |
281 | | bool _externalParameterEntities; |
282 | | bool _enablePartialReads; |
283 | | NamespaceStrategy* _pNamespaceStrategy; |
284 | | EncodingMap _encodings; |
285 | | ContextStack _context; |
286 | | |
287 | | EntityResolver* _pEntityResolver; |
288 | | DTDHandler* _pDTDHandler; |
289 | | DeclHandler* _pDeclHandler; |
290 | | ContentHandler* _pContentHandler; |
291 | | LexicalHandler* _pLexicalHandler; |
292 | | ErrorHandler* _pErrorHandler; |
293 | | |
294 | | float _maximumAmplificationFactor; |
295 | | Poco::UInt64 _activationThresholdBytes; |
296 | | |
297 | | static const int PARSE_BUFFER_SIZE; |
298 | | static const XMLString EMPTY_STRING; |
299 | | }; |
300 | | |
301 | | |
302 | | // |
303 | | // inlines |
304 | | // |
305 | | inline const XMLString& ParserEngine::getEncoding() const /* Accessor: returns a const reference to the _encoding member; no copy is made. */ |
306 | 0 | { |
307 | 0 | return _encoding; |
308 | 0 | } |
309 | | |
310 | | |
311 | | inline NamespaceStrategy* ParserEngine::getNamespaceStrategy() const /* Accessor: returns the stored _pNamespaceStrategy pointer unchanged. */ |
312 | 0 | { |
313 | 0 | return _pNamespaceStrategy; |
314 | 0 | } |
315 | | |
316 | | |
317 | | inline bool ParserEngine::getExpandInternalEntities() const /* Accessor: returns the current value of the _expandInternalEntities flag. */ |
318 | 0 | { |
319 | 0 | return _expandInternalEntities; |
320 | 0 | } |
321 | | |
322 | | |
323 | | inline bool ParserEngine::getExternalGeneralEntities() const /* Accessor: returns the current value of the _externalGeneralEntities flag. */ |
324 | 0 | { |
325 | 0 | return _externalGeneralEntities; |
326 | 0 | } |
327 | | |
328 | | |
329 | | inline bool ParserEngine::getExternalParameterEntities() const /* Accessor: returns the current value of the _externalParameterEntities flag. */ |
330 | 0 | { |
331 | 0 | return _externalParameterEntities; |
332 | 0 | } |
333 | | |
334 | | |
335 | | inline EntityResolver* ParserEngine::getEntityResolver() const /* Accessor: returns the stored _pEntityResolver pointer unchanged. */ |
336 | 0 | { |
337 | 0 | return _pEntityResolver; |
338 | 0 | } |
339 | | |
340 | | |
341 | | inline DTDHandler* ParserEngine::getDTDHandler() const /* Accessor: returns the stored _pDTDHandler pointer unchanged. */ |
342 | 0 | { |
343 | 0 | return _pDTDHandler; |
344 | 0 | } |
345 | | |
346 | | |
347 | | inline DeclHandler* ParserEngine::getDeclHandler() const /* Accessor: returns the stored _pDeclHandler pointer unchanged. */ |
348 | 0 | { |
349 | 0 | return _pDeclHandler; |
350 | 0 | } |
351 | | |
352 | | |
353 | | inline ContentHandler* ParserEngine::getContentHandler() const /* Accessor: returns the stored _pContentHandler pointer unchanged. */ |
354 | 0 | { |
355 | 0 | return _pContentHandler; |
356 | 0 | } |
357 | | |
358 | | |
359 | | inline LexicalHandler* ParserEngine::getLexicalHandler() const /* Accessor: returns the stored _pLexicalHandler pointer unchanged. */ |
360 | 0 | { |
361 | 0 | return _pLexicalHandler; |
362 | 0 | } |
363 | | |
364 | | |
365 | | inline ErrorHandler* ParserEngine::getErrorHandler() const /* Accessor: returns the stored _pErrorHandler pointer unchanged. */ |
366 | 0 | { |
367 | 0 | return _pErrorHandler; |
368 | 0 | } |
369 | | |
370 | | |
371 | | inline bool ParserEngine::getEnablePartialReads() const /* Accessor: returns the current value of the _enablePartialReads flag. */ |
372 | 0 | { |
373 | 0 | return _enablePartialReads; |
374 | 0 | } |
375 | | |
376 | | |
377 | | } } // namespace Poco::XML |
378 | | |
379 | | |
380 | | #endif // XML_ParserEngine_INCLUDED |