XMLReaderUtils.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.utils;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.Serializable;
import java.io.StringReader;
import java.lang.reflect.Method;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLResolver;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.sax.SAXTransformerFactory;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.DTDHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.OfflineContentHandler;
/**
* Utility functions for reading XML.
*/
public class XMLReaderUtils implements Serializable {
/**
* Default size for the pool of SAX Parsers
* and the pool of DOM builders
*/
public static final int DEFAULT_POOL_SIZE = 10;
/**
* Entity expansion limit used when the system property
* <code>jdk.xml.entityExpansionLimit</code> is absent or is not an integer.
*/
public static final int DEFAULT_MAX_ENTITY_EXPANSIONS = 20;
/**
* Initial value for the maximum number of times a pooled parser/builder is reused.
*/
public static final int DEFAULT_NUM_REUSES = 100;
/**
* Serial version UID
*/
private static final long serialVersionUID = 6110455808615143122L;
private static final Logger LOG = LoggerFactory.getLogger(XMLReaderUtils.class);
//class name of the standalone Xerces security manager; it is loaded reflectively
private static final String XERCES_SECURITY_MANAGER = "org.apache.xerces.util.SecurityManager";
//parser/factory property under which a security manager instance is installed
private static final String XERCES_SECURITY_MANAGER_PROPERTY =
"http://apache.org/xml/properties/security-manager";
//flipped to true the first time the "could not set limit" StAX warning is logged
private static final AtomicBoolean HAS_WARNED_STAX = new AtomicBoolean(false);
//a plain DefaultHandler
//NOTE(review): no use of this constant is visible in this part of the file
private static final ContentHandler IGNORING_CONTENT_HANDLER = new DefaultHandler();
//DTDHandler whose callbacks have empty bodies: notation and unparsed-entity
//declarations reported to it are discarded.
//NOTE(review): no use of this constant is visible in this part of the file
private static final DTDHandler IGNORING_DTD_HANDLER = new DTDHandler() {
@Override
public void notationDecl(String name, String publicId, String systemId)
throws SAXException {
//no-op
}
@Override
public void unparsedEntityDecl(String name, String publicId, String systemId,
String notationName) throws SAXException {
//no-op
}
};
//ErrorHandler whose callbacks have empty bodies: warnings, errors and fatal errors
//passed to it are neither logged nor rethrown by the handler itself.
//NOTE(review): no use of this constant is visible in this part of the file
private static final ErrorHandler IGNORING_ERROR_HANDLER = new ErrorHandler() {
@Override
public void warning(SAXParseException exception) throws SAXException {
//no-op
}
@Override
public void error(SAXParseException exception) throws SAXException {
//no-op
}
@Override
public void fatalError(SAXParseException exception) throws SAXException {
//no-op
}
};
//system property consulted once, at class initialization, for the entity expansion limit
private static final String JAXP_ENTITY_EXPANSION_LIMIT_KEY = "jdk.xml.entityExpansionLimit";
//TODO: figure out if the rw lock is any better than a simple lock
//these lock the pool arrayblocking queues so that there isn't a race condition
//of trying to acquire a parser while the pool is being resized
//(acquire/release take the read lock; setPoolSize takes the write lock)
private static final ReentrantReadWriteLock SAX_POOL_LOCK = new ReentrantReadWriteLock();
private static final ReentrantReadWriteLock DOM_POOL_LOCK = new ReentrantReadWriteLock();
//incremented by setPoolSize when it rebuilds the SAX pool; pooled parsers/builders
//carrying a different generation are not put back on release
private static final AtomicInteger POOL_GENERATION = new AtomicInteger();
//resolves every external entity to an empty character stream
private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER =
(publicId, systemId) -> new InputSource(new StringReader(""));
//BE CAREFUL with the return type. Some parsers will silently ignore an unexpected return type: CVE-2025-54988
private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER =
(publicID, systemID, baseURI, namespace) ->
UnsynchronizedByteArrayInputStream.nullInputStream();
/**
* Parser pool size
*/
private static int POOL_SIZE = DEFAULT_POOL_SIZE;
//a pooled parser/builder is replaced by a new one once its use count reaches this value
private static int MAX_NUM_REUSES = DEFAULT_NUM_REUSES;
//timestamp (ms) of the last throttled "security manager" warning; -1 until the first one
//NOTE(review): read and written without synchronization -- confirm that is acceptable
private static long LAST_LOG = -1;
private static volatile int MAX_ENTITY_EXPANSIONS = determineMaxEntityExpansions();
//both queues are replaced (not just refilled) by setPoolSize when the new size is > 0
private static ArrayBlockingQueue<PoolSAXParser> SAX_PARSERS =
new ArrayBlockingQueue<>(POOL_SIZE);
private static ArrayBlockingQueue<PoolDOMBuilder> DOM_BUILDERS =
new ArrayBlockingQueue<>(POOL_SIZE);
//Fill both pools when the class is initialized. This block runs after the field
//initializers above, so POOL_SIZE still holds DEFAULT_POOL_SIZE here.
//A failure to build the pools is rethrown unchecked.
static {
try {
setPoolSize(POOL_SIZE);
} catch (TikaException e) {
throw new RuntimeException("problem initializing SAXParser and DOMBuilder pools", e);
}
}
/**
 * Works out the initial entity expansion limit.
 * <p>
 * The system property named by {@link #JAXP_ENTITY_EXPANSION_LIMIT_KEY} wins when it is
 * set and parses as an integer; otherwise {@link #DEFAULT_MAX_ENTITY_EXPANSIONS} is used.
 * An unparseable value is logged at warn level.
 * </p>
 *
 * @return the limit to start with
 */
private static int determineMaxEntityExpansions() {
    final String configured = System.getProperty(JAXP_ENTITY_EXPANSION_LIMIT_KEY);
    if (configured == null) {
        return DEFAULT_MAX_ENTITY_EXPANSIONS;
    }
    int limit = DEFAULT_MAX_ENTITY_EXPANSIONS;
    try {
        limit = Integer.parseInt(configured);
    } catch (NumberFormatException e) {
        LOG.warn(
                "Couldn't parse an integer for the entity expansion limit: {}; " +
                        "backing off to default: {}",
                configured, DEFAULT_MAX_ENTITY_EXPANSIONS);
    }
    return limit;
}
/**
 * Creates an {@link XMLReader} backed by a new parser from {@link #getSAXParser()}.
 * The reader's entity resolver is set to one that resolves every entity to empty content.
 *
 * @return a new XMLReader
 * @throws TikaException if the underlying parser cannot hand out an XMLReader
 * @see #getSAXParser()
 * @since Apache Tika 1.13
 */
public static XMLReader getXMLReader() throws TikaException {
    try {
        final XMLReader xmlReader = getSAXParser().getXMLReader();
        xmlReader.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
        return xmlReader;
    } catch (SAXException e) {
        throw new TikaException("Unable to create an XMLReader", e);
    }
}
/**
 * Creates a new SAX parser from {@link #getSAXParserFactory()} and then tries to
 * install an entity expansion limit on it.
 * <p>
 * If you call reset() on the parser, make sure to replace the
 * SecurityManager which will be cleared by xerces2 on reset().
 * </p>
 *
 * @return a new SAX parser
 * @throws TikaException if a SAX parser could not be configured or created
 * @see #getSAXParserFactory()
 * @since Apache Tika 0.8
 */
public static SAXParser getSAXParser() throws TikaException {
    final SAXParser saxParser;
    try {
        saxParser = getSAXParserFactory().newSAXParser();
    } catch (ParserConfigurationException e) {
        throw new TikaException("Unable to configure a SAX parser", e);
    } catch (SAXException e) {
        throw new TikaException("Unable to create a SAX parser", e);
    }
    //declares no checked exceptions, so it does not need to sit inside the try
    trySetXercesSecurityManager(saxParser);
    return saxParser;
}
/**
 * Creates a new {@link SAXParserFactory} with defensive settings.
 * <p>
 * The factory is namespace-aware and non-validating. It then attempts to switch on
 * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing} and to switch off
 * external general entities, external parameter entities, external DTD loading and
 * DTD grammar loading.
 * </p>
 *
 * @return a new SAX parser factory
 * @since Apache Tika 0.8
 */
public static SAXParserFactory getSAXParserFactory() {
    final SAXParserFactory saxFactory = SAXParserFactory.newInstance();
    if (LOG.isDebugEnabled()) {
        LOG.debug("SAXParserFactory class {}", saxFactory.getClass());
    }
    saxFactory.setNamespaceAware(true);
    saxFactory.setValidating(false);
    trySetSAXFeature(saxFactory, XMLConstants.FEATURE_SECURE_PROCESSING, true);
    //features that get switched off, applied in this order
    final String[] disabledFeatures = {
            "http://xml.org/sax/features/external-general-entities",
            "http://xml.org/sax/features/external-parameter-entities",
            "http://apache.org/xml/features/nonvalidating/load-external-dtd",
            "http://apache.org/xml/features/nonvalidating/load-dtd-grammar"
    };
    for (String disabledFeature : disabledFeatures) {
        trySetSAXFeature(saxFactory, disabledFeature, false);
    }
    return saxFactory;
}
/**
 * Creates a new {@link DocumentBuilderFactory} with defensive settings.
 * <p>
 * The factory does not expand entity references, is namespace-aware and is
 * non-validating. It then attempts to switch on
 * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}, to switch off
 * external entities and DTD loading, and to install an entity expansion limit.
 * </p>
 *
 * @return a new DOM builder factory
 * @since Apache Tika 1.13
 */
public static DocumentBuilderFactory getDocumentBuilderFactory() {
    //borrowed from Apache POI
    final DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
    if (LOG.isDebugEnabled()) {
        LOG.debug("DocumentBuilderFactory class {}", domFactory.getClass());
    }
    domFactory.setExpandEntityReferences(false);
    domFactory.setNamespaceAware(true);
    domFactory.setValidating(false);
    trySetSAXFeature(domFactory, XMLConstants.FEATURE_SECURE_PROCESSING, true);
    //features that get switched off, applied in this order
    final String[] disabledFeatures = {
            "http://xml.org/sax/features/external-general-entities",
            "http://xml.org/sax/features/external-parameter-entities",
            "http://apache.org/xml/features/nonvalidating/load-external-dtd",
            "http://apache.org/xml/features/nonvalidating/load-dtd-grammar"
    };
    for (String disabledFeature : disabledFeatures) {
        trySetSAXFeature(domFactory, disabledFeature, false);
    }
    trySetXercesSecurityManager(domFactory);
    return domFactory;
}
/**
 * Creates a new {@link DocumentBuilder} from {@link #getDocumentBuilderFactory()}.
 * The builder gets the entity resolver <code>IGNORING_SAX_ENTITY_RESOLVER</code>
 * and its ErrorHandler is set to <code>null</code>.
 *
 * @return a new DOM builder
 * @throws TikaException if the factory cannot create a builder
 * @since Apache Tika 1.13
 */
public static DocumentBuilder getDocumentBuilder() throws TikaException {
    final DocumentBuilder domBuilder;
    try {
        domBuilder = getDocumentBuilderFactory().newDocumentBuilder();
    } catch (ParserConfigurationException e) {
        throw new TikaException("XML parser not available", e);
    }
    domBuilder.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
    domBuilder.setErrorHandler(null);
    return domBuilder;
}
/**
 * Creates a new StAX {@link XMLInputFactory} with defensive settings.
 * <p>
 * The factory is asked to be namespace-aware, to forbid access to external DTDs,
 * to be non-validating, to drop DTD support and to not support external entities.
 * It also gets a resolver that returns empty content, and an attempt is made to
 * install an entity expansion limit.
 * </p>
 *
 * @return a new StAX input factory
 * @since Apache Tika 1.13
 */
public static XMLInputFactory getXMLInputFactory() {
    final XMLInputFactory staxFactory = XMLInputFactory.newFactory();
    if (LOG.isDebugEnabled()) {
        LOG.debug("XMLInputFactory class {}", staxFactory.getClass());
    }
    tryToSetStaxProperty(staxFactory, XMLInputFactory.IS_NAMESPACE_AWARE, true);
    //try to configure secure processing
    tryToSetStaxProperty(staxFactory, XMLConstants.ACCESS_EXTERNAL_DTD, "");
    final String[] switchedOff = {
            XMLInputFactory.IS_VALIDATING,
            XMLInputFactory.SUPPORT_DTD,
            XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES
    };
    for (String property : switchedOff) {
        tryToSetStaxProperty(staxFactory, property, false);
    }
    //defense in depth
    staxFactory.setXMLResolver(IGNORING_STAX_ENTITY_RESOLVER);
    trySetStaxSecurityManager(staxFactory);
    return staxFactory;
}
/**
 * Attempts to set an attribute on a {@link TransformerFactory}.
 * A {@link SecurityException} is propagated; any other exception, and an
 * {@link AbstractMethodError}, is logged at warn level.
 *
 * @param transformerFactory factory to configure
 * @param attribute          attribute name
 * @param value              attribute value
 */
private static void trySetTransformerAttribute(TransformerFactory transformerFactory,
                                               String attribute, String value) {
    try {
        transformerFactory.setAttribute(attribute, value);
    } catch (Exception e) {
        if (e instanceof SecurityException) {
            //security exceptions are never swallowed
            throw (SecurityException) e;
        }
        LOG.warn("Transformer Attribute unsupported: {}", attribute, e);
    } catch (AbstractMethodError ame) {
        LOG.warn(
                "Cannot set Transformer attribute because outdated XML parser in classpath: {}",
                attribute, ame);
    }
}
/**
 * Attempts to set a feature on a {@link SAXParserFactory}.
 * A {@link SecurityException} is propagated; any other exception, and an
 * {@link AbstractMethodError}, is logged at warn level.
 *
 * @param saxParserFactory factory to configure
 * @param feature          feature URI
 * @param enabled          desired state of the feature
 */
private static void trySetSAXFeature(SAXParserFactory saxParserFactory, String feature,
                                     boolean enabled) {
    try {
        saxParserFactory.setFeature(feature, enabled);
    } catch (Exception e) {
        if (e instanceof SecurityException) {
            //security exceptions are never swallowed
            throw (SecurityException) e;
        }
        LOG.warn("SAX Feature unsupported: {}", feature, e);
    } catch (AbstractMethodError ame) {
        LOG.warn("Cannot set SAX feature because outdated XML parser in classpath: {}", feature,
                ame);
    }
}
/**
 * Attempts to set a feature on a {@link DocumentBuilderFactory}.
 * <p>
 * A {@link SecurityException} is propagated, matching the SAXParserFactory overload
 * of this method. Any other exception, and an {@link AbstractMethodError}, is logged
 * at warn level.
 * </p>
 *
 * @param documentBuilderFactory factory to configure
 * @param feature                feature URI
 * @param enabled                desired state of the feature
 */
private static void trySetSAXFeature(DocumentBuilderFactory documentBuilderFactory,
                                     String feature, boolean enabled) {
    try {
        documentBuilderFactory.setFeature(feature, enabled);
    } catch (SecurityException e) {
        //don't swallow security exceptions
        throw e;
    } catch (Exception e) {
        LOG.warn("SAX Feature unsupported: {}", feature, e);
    } catch (AbstractMethodError ame) {
        LOG.warn("Cannot set SAX feature because outdated XML parser in classpath: {}", feature,
                ame);
    }
}
/**
 * Attempts to set a boolean StAX property; an unsupported property is logged.
 *
 * @param factory factory to configure
 * @param key     property name
 * @param value   property value
 */
private static void tryToSetStaxProperty(XMLInputFactory factory, String key, boolean value) {
    setStaxPropertyOrWarn(factory, key, Boolean.valueOf(value));
}

/**
 * Attempts to set a String StAX property; an unsupported property is logged.
 *
 * @param factory factory to configure
 * @param key     property name
 * @param value   property value
 */
private static void tryToSetStaxProperty(XMLInputFactory factory, String key, String value) {
    setStaxPropertyOrWarn(factory, key, value);
}

/**
 * Shared body of the two overloads above: sets the property and logs an
 * {@link IllegalArgumentException} at warn level instead of propagating it.
 */
private static void setStaxPropertyOrWarn(XMLInputFactory factory, String key, Object value) {
    try {
        factory.setProperty(key, value);
    } catch (IllegalArgumentException e) {
        LOG.warn("StAX Feature unsupported: {}", key, e);
    }
}
/**
 * Creates a new {@link Transformer} from {@link #getTransformerFactory()}.
 * <p>
 * The factory behind it is configured to use
 * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
 * </p>
 *
 * @return a new Transformer
 * @throws TikaException when the factory or the transformer cannot be created
 * @since Apache Tika 1.17
 */
public static Transformer getTransformer() throws TikaException {
    try {
        return getTransformerFactory().newTransformer();
    } catch (TransformerConfigurationException e) {
        throw new TikaException("Transformer not available", e);
    }
}
/**
 * Creates a new {@link TransformerFactory} with
 * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing} switched on.
 * It also attempts to forbid access to external DTDs and external stylesheets,
 * to prevent XXE.
 *
 * @return a new TransformerFactory
 * @throws TikaException if the factory cannot be created or secure processing
 *                       cannot be enabled
 */
public static TransformerFactory getTransformerFactory() throws TikaException {
    try {
        final TransformerFactory factory = TransformerFactory.newInstance();
        factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
        final String[] restrictedAccess = {
                XMLConstants.ACCESS_EXTERNAL_DTD,
                XMLConstants.ACCESS_EXTERNAL_STYLESHEET
        };
        for (String attribute : restrictedAccess) {
            trySetTransformerAttribute(factory, attribute, "");
        }
        return factory;
    } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) {
        throw new TikaException("Transformer not available", e);
    }
}
/**
 * Creates a new {@link SAXTransformerFactory} with
 * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing} switched on.
 * It also attempts to forbid access to external DTDs and external stylesheets,
 * to prevent XXE.
 *
 * @return a new SAXTransformerFactory
 * @throws TikaException if the factory cannot be created or secure processing
 *                       cannot be enabled
 */
public static SAXTransformerFactory getSAXTransformerFactory() throws TikaException {
    try {
        final SAXTransformerFactory saxFactory =
                (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        saxFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
        final String[] restrictedAccess = {
                XMLConstants.ACCESS_EXTERNAL_DTD,
                XMLConstants.ACCESS_EXTERNAL_STYLESHEET
        };
        for (String attribute : restrictedAccess) {
            trySetTransformerAttribute(saxFactory, attribute, "");
        }
        return saxFactory;
    } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) {
        throw new TikaException("Transformer not available", e);
    }
}
/**
 * Parses a byte stream into a DOM {@link Document}.
 * <p>
 * A {@link DocumentBuilder} found in the context is used as is. Otherwise a builder
 * is borrowed from the pool; when pooling is off (pool size 0) or the pool is empty,
 * a new builder is created. A borrowed builder is handed back when parsing ends,
 * whether or not it succeeded.
 * </p>
 *
 * @param is      InputStream to parse
 * @param context context to use
 * @return a document
 * @throws TikaException if a builder could not be created
 * @throws IOException   on a read failure
 * @throws SAXException  on a parse failure
 * @since Apache Tika 1.19
 */
public static Document buildDOM(InputStream is, ParseContext context)
        throws TikaException, IOException, SAXException {
    DocumentBuilder domBuilder = context.get(DocumentBuilder.class);
    PoolDOMBuilder pooled = null;
    if (domBuilder == null) {
        pooled = (POOL_SIZE == 0) ? null : acquireDOMBuilder();
        domBuilder = (pooled == null) ? getDocumentBuilder() : pooled.getDocumentBuilder();
    }
    try {
        return domBuilder.parse(is);
    } finally {
        //no-op when nothing was borrowed
        releaseDOMBuilder(pooled);
    }
}
/**
 * Parses a character stream into a DOM {@link Document}.
 * <p>
 * A {@link DocumentBuilder} found in the context is used as is. Otherwise a builder
 * is borrowed from the pool; when pooling is off (pool size 0) or the pool is empty,
 * a new builder is created. A borrowed builder is handed back when parsing ends,
 * whether or not it succeeded.
 * </p>
 *
 * @param reader  reader (character stream) to parse
 * @param context context to use
 * @return a document
 * @throws TikaException if a builder could not be created
 * @throws IOException   on a read failure
 * @throws SAXException  on a parse failure
 * @since Apache Tika 2.5
 */
public static Document buildDOM(Reader reader, ParseContext context)
        throws TikaException, IOException, SAXException {
    DocumentBuilder domBuilder = context.get(DocumentBuilder.class);
    PoolDOMBuilder pooled = null;
    if (domBuilder == null) {
        pooled = (POOL_SIZE == 0) ? null : acquireDOMBuilder();
        domBuilder = (pooled == null) ? getDocumentBuilder() : pooled.getDocumentBuilder();
    }
    try {
        return domBuilder.parse(new InputSource(reader));
    } finally {
        //no-op when nothing was borrowed
        releaseDOMBuilder(pooled);
    }
}
/**
 * Opens the file at <code>path</code> and parses it into a DOM {@link Document}
 * via {@link #buildDOM(InputStream)}. The stream is closed before returning.
 *
 * @param path path to parse
 * @return a document
 * @throws TikaException if a builder could not be created
 * @throws IOException   if the file cannot be opened or read
 * @throws SAXException  on a parse failure
 * @since Apache Tika 1.19.1
 */
public static Document buildDOM(Path path) throws TikaException, IOException, SAXException {
    try (InputStream fileStream = Files.newInputStream(path)) {
        final Document parsed = buildDOM(fileStream);
        return parsed;
    }
}
/**
 * Parses the content found at a URI into a DOM {@link Document}.
 * <p>
 * The string is handed to {@link DocumentBuilder#parse(String)}. A builder is borrowed
 * from the pool; when pooling is off (pool size 0) or the pool is empty, a new builder
 * is created. A borrowed builder is handed back when parsing ends.
 * </p>
 *
 * @param uriString uriString to process
 * @return a document
 * @throws TikaException if a builder could not be created
 * @throws IOException   on a read failure
 * @throws SAXException  on a parse failure
 * @since Apache Tika 1.19.1
 */
public static Document buildDOM(String uriString)
        throws TikaException, IOException, SAXException {
    final PoolDOMBuilder pooled = (POOL_SIZE == 0) ? null : acquireDOMBuilder();
    final DocumentBuilder domBuilder =
            (pooled == null) ? getDocumentBuilder() : pooled.getDocumentBuilder();
    try {
        return domBuilder.parse(uriString);
    } finally {
        //no-op when nothing was borrowed
        releaseDOMBuilder(pooled);
    }
}
/**
 * Parses a byte stream into a DOM {@link Document}.
 * <p>
 * A builder is borrowed from the pool; when pooling is off (pool size 0) or the pool
 * is empty, a new builder is created. A borrowed builder is handed back when parsing
 * ends. The stream is not closed by this method.
 * </p>
 *
 * @param is InputStream to parse
 * @return a document
 * @throws TikaException if a builder could not be created
 * @throws IOException   on a read failure
 * @throws SAXException  on a parse failure
 * @since Apache Tika 1.19.1
 */
public static Document buildDOM(InputStream is)
        throws TikaException, IOException, SAXException {
    final PoolDOMBuilder pooled = (POOL_SIZE == 0) ? null : acquireDOMBuilder();
    final DocumentBuilder domBuilder =
            (pooled == null) ? getDocumentBuilder() : pooled.getDocumentBuilder();
    try {
        return domBuilder.parse(is);
    } finally {
        //no-op when nothing was borrowed
        releaseDOMBuilder(pooled);
    }
}
/**
 * SAX-parses a byte stream.
 * <p>
 * A {@link SAXParser} found in the context is used as is. Otherwise a parser is
 * borrowed from the pool; when pooling is off (pool size 0) or the pool is empty,
 * a new parser is created. A borrowed parser is handed back when parsing ends,
 * whether or not it succeeded.
 * </p>
 *
 * @param is             InputStream to parse
 * @param contentHandler handler to use; it is wrapped in an {@link OfflineContentHandler}
 *                       as an extra layer of defense against
 *                       external entity vulnerabilities
 * @param context        context to use
 * @throws TikaException if a parser could not be created
 * @throws IOException   on a read failure
 * @throws SAXException  on a parse failure
 * @since Apache Tika 1.19
 */
public static void parseSAX(InputStream is, ContentHandler contentHandler, ParseContext context)
        throws TikaException, IOException, SAXException {
    SAXParser saxParser = context.get(SAXParser.class);
    PoolSAXParser pooled = null;
    if (saxParser == null) {
        pooled = (POOL_SIZE == 0) ? null : acquireSAXParser();
        saxParser = (pooled == null) ? getSAXParser() : pooled.getSAXParser();
    }
    try {
        saxParser.parse(is, new OfflineContentHandler(contentHandler));
    } finally {
        //no-op when nothing was borrowed
        releaseParser(pooled);
    }
}
/**
 * SAX-parses a character stream.
 * <p>
 * A {@link SAXParser} found in the context is used as is. Otherwise a parser is
 * borrowed from the pool; when pooling is off (pool size 0) or the pool is empty,
 * a new parser is created. A borrowed parser is handed back when parsing ends,
 * whether or not it succeeded.
 * </p>
 *
 * @param reader         reader (character stream) to parse
 * @param contentHandler handler to use; it is wrapped in an {@link OfflineContentHandler}
 *                       as an extra layer of defense against
 *                       external entity vulnerabilities
 * @param context        context to use
 * @throws TikaException if a parser could not be created
 * @throws IOException   on a read failure
 * @throws SAXException  on a parse failure
 * @since Apache Tika 2.5
 */
public static void parseSAX(Reader reader, ContentHandler contentHandler, ParseContext context)
        throws TikaException, IOException, SAXException {
    SAXParser saxParser = context.get(SAXParser.class);
    PoolSAXParser pooled = null;
    if (saxParser == null) {
        pooled = (POOL_SIZE == 0) ? null : acquireSAXParser();
        saxParser = (pooled == null) ? getSAXParser() : pooled.getSAXParser();
    }
    try {
        saxParser.parse(new InputSource(reader), new OfflineContentHandler(contentHandler));
    } finally {
        //no-op when nothing was borrowed
        releaseParser(pooled);
    }
}
/**
 * Takes a DOMBuilder out of the pool without waiting. Make sure to
 * {@link #releaseDOMBuilder(PoolDOMBuilder)} in
 * a <code>finally</code> block every time you call this.
 * An empty pool is reported at warn level.
 *
 * @return a pooled builder, or null if none is available right now
 * @throws TikaException declared for callers; nothing in this method throws it
 */
private static PoolDOMBuilder acquireDOMBuilder() throws TikaException {
    final ReentrantReadWriteLock.ReadLock poolReadLock = DOM_POOL_LOCK.readLock();
    final PoolDOMBuilder pooled;
    //read lock: keeps a concurrent resize from swapping the queue mid-poll
    poolReadLock.lock();
    try {
        pooled = DOM_BUILDERS.poll();
    } finally {
        poolReadLock.unlock();
    }
    if (pooled == null) {
        LOG.warn("Contention waiting for a DOMBuilder. " +
                "Consider increasing the XMLReaderUtils.POOL_SIZE");
    }
    return pooled;
}
/**
 * Return a builder to the pool for reuse.
 * <p>
 * A <code>null</code> builder, or one from an older pool generation, is dropped.
 * Otherwise the builder is reset and its use count incremented. Once the count reaches
 * the maximum number of reuses it is replaced by a newly created builder of the
 * same generation, and the result is offered back to the pool.
 * </p>
 * <p>
 * Everything done while holding the read lock sits inside <code>try/finally</code>,
 * so the lock is released on every exit path, including the early return taken
 * when a replacement builder cannot be created.
 * </p>
 *
 * @param builder builder to return
 */
private static void releaseDOMBuilder(PoolDOMBuilder builder) {
    if (builder == null) {
        return;
    }
    //if this is a different generation, don't put it back in the pool
    if (builder.getPoolGeneration() != POOL_GENERATION.get()) {
        return;
    }
    try {
        builder.reset();
    } catch (UnsupportedOperationException e) {
        //ignore
    }
    DOM_POOL_LOCK
            .readLock().lock();
    try {
        builder.incrementUses();
        if (builder.numUses >= MAX_NUM_REUSES) {
            try {
                //getDocumentBuilder() applies the same entity resolver and error handler
                //settings as the builders created when the pool is (re)built
                builder = new PoolDOMBuilder(builder.getPoolGeneration(), getDocumentBuilder());
            } catch (TikaException e) {
                LOG.warn("Exception trying to configure a new dom builder?!", e);
                return;
            }
        }
        //if there are extra parsers (e.g. after a reset of the pool to a smaller size),
        // this parser will not be added and will then be gc'd
        boolean success = DOM_BUILDERS.offer(builder);
        if (!success) {
            LOG.warn(
                    "DocumentBuilder not taken back into pool.  If you haven't resized the " +
                            "pool, this could be a sign that there are more calls to " +
                            "'acquire' than to 'release'");
        }
    } finally {
        DOM_POOL_LOCK
                .readLock().unlock();
    }
}
/**
 * Takes a SAXParser out of the pool without waiting. Make sure to
 * {@link #releaseParser(PoolSAXParser)} in
 * a <code>finally</code> block every time you call this.
 * An empty pool is reported at warn level.
 *
 * @return a pooled parser, or null if none is available right now
 * @throws TikaException declared for callers; nothing in this method throws it
 */
private static PoolSAXParser acquireSAXParser() throws TikaException {
    final ReentrantReadWriteLock.ReadLock poolReadLock = SAX_POOL_LOCK.readLock();
    final PoolSAXParser pooled;
    //this locks around the pool so that there's
    //no race condition with it being resized
    poolReadLock.lock();
    try {
        pooled = SAX_PARSERS.poll();
    } finally {
        poolReadLock.unlock();
    }
    if (pooled == null) {
        LOG.warn("Contention waiting for a SAXParser. " +
                "Consider increasing the XMLReaderUtils.POOL_SIZE");
    }
    return pooled;
}
/**
 * Return a parser to the pool for reuse.
 * <p>
 * A <code>null</code> parser is ignored. Otherwise the parser is reset; if it belongs to
 * an older pool generation it is then dropped. A current-generation parser has its
 * use count incremented, is replaced by a newly built parser once the count reaches
 * the maximum number of reuses, and is offered back to the pool.
 * </p>
 *
 * @param parser parser to return
 */
private static void releaseParser(PoolSAXParser parser) {
    if (parser == null) {
        return;
    }
    try {
        parser.reset();
    } catch (UnsupportedOperationException e) {
        //TIKA-3009 -- we really shouldn't have to do this... :(
    }
    //if this is a different generation, don't put it back
    //in the pool
    if (parser.getGeneration() != POOL_GENERATION.get()) {
        return;
    }
    final ReentrantReadWriteLock.ReadLock poolReadLock = SAX_POOL_LOCK.readLock();
    poolReadLock.lock();
    try {
        parser.incrementUses();
        PoolSAXParser toOffer = parser;
        if (parser.numUses >= MAX_NUM_REUSES) {
            try {
                toOffer = buildPoolParser(parser.getGeneration(),
                        getSAXParserFactory().newSAXParser());
            } catch (SAXException | ParserConfigurationException e) {
                LOG.warn("Couldn't build new SAXParser after hitting max reuses", e);
                return;
            }
        }
        //if there are extra parsers (e.g. after a reset of the pool to a smaller size),
        // this parser will not be added and will then be gc'd
        if (!SAX_PARSERS.offer(toOffer)) {
            LOG.warn(
                    "SAXParser not taken back into pool.  If you haven't resized the pool " +
                            "this could be a sign that there are more calls to 'acquire' " +
                            "than to 'release'");
        }
    } finally {
        poolReadLock.unlock();
    }
}
/**
* Tries to limit entity expansions on a DocumentBuilderFactory.
* <p>
* First the standalone Xerces SecurityManager class is loaded reflectively, given the
* current maximum number of entity expansions, and installed as a factory attribute.
* If that class is not on the classpath, or installing it fails, the JAXP
* <code>entityExpansionLimit</code> attribute is set instead. Failures are logged at
* warn level, at most once per five minutes.
* </p>
*
* @param factory factory to configure
*/
private static void trySetXercesSecurityManager(DocumentBuilderFactory factory) {
//from POI
// Only the standalone Xerces SecurityManager is tried; the JVM-internal class name
// is left commented out in the array below
for (String securityManagerClassName : new String[]{
//"com.sun.org.apache.xerces.internal.util.SecurityManager",
XERCES_SECURITY_MANAGER}) {
try {
Object mgr =
Class.forName(securityManagerClassName).getDeclaredConstructor().newInstance();
Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit",
Integer.TYPE);
setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS);
factory.setAttribute(XERCES_SECURITY_MANAGER_PROPERTY, mgr);
// Stop once one can be setup without error
return;
} catch (ClassNotFoundException e) {
// continue without log, this is expected in some setups
} catch (Throwable e) { // NOSONAR - also catch things like NoClassDefError here
// throttle the log somewhat as it can spam the log otherwise
if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
LOG.warn(
"SAX Security Manager could not be setup [log suppressed for 5 " +
"minutes]",
e);
LAST_LOG = System.currentTimeMillis();
}
}
}
// no security manager could be installed above => use the builtin way of setting the property
try {
factory.setAttribute("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit",
MAX_ENTITY_EXPANSIONS);
} catch (IllegalArgumentException e) {
// NOSONAR - only IllegalArgumentException is caught here
// throttle the log somewhat as it can spam the log otherwise
if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
LOG.warn("SAX Security Manager could not be setup [log suppressed for 5 minutes]",
e);
LAST_LOG = System.currentTimeMillis();
}
}
}
/**
* Tries to limit entity expansions on a SAXParser.
* <p>
* First the standalone Xerces SecurityManager class is loaded reflectively, given the
* current maximum number of entity expansions, and installed as a parser property.
* If that class is not on the classpath, or installing it fails, the JAXP
* <code>entityExpansionLimit</code> property is set instead. Failures are logged at
* warn level, at most once per five minutes.
* </p>
*
* @param parser parser to configure
*/
private static void trySetXercesSecurityManager(SAXParser parser) {
//from POI
// Only the standalone Xerces SecurityManager is tried; the JVM-internal class name
// is left commented out in the array below
for (String securityManagerClassName : new String[]{
//"com.sun.org.apache.xerces.internal.util.SecurityManager",
XERCES_SECURITY_MANAGER}) {
try {
Object mgr =
Class.forName(securityManagerClassName).getDeclaredConstructor().newInstance();
Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS);
parser.setProperty(XERCES_SECURITY_MANAGER_PROPERTY, mgr);
// Stop once one can be setup without error
return;
} catch (ClassNotFoundException e) {
// continue without log, this is expected in some setups
} catch (Throwable e) {
// NOSONAR - also catch things like NoClassDefError here
// throttle the log somewhat as it can spam the log otherwise
if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
LOG.warn(
"SAX Security Manager could not be setup [log suppressed for 5 " +
"minutes]",
e);
LAST_LOG = System.currentTimeMillis();
}
}
}
// no security manager could be installed above => use the builtin way of setting the property
try {
parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit",
MAX_ENTITY_EXPANSIONS);
} catch (SAXException e) { // NOSONAR - only SAXException is caught here
// throttle the log somewhat as it can spam the log otherwise
if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
LOG.warn("SAX Security Manager could not be setup [log suppressed for 5 minutes]",
e);
LAST_LOG = System.currentTimeMillis();
}
}
}
/**
 * Tries to limit entity expansions on a StAX input factory.
 * The JAXP <code>entityExpansionLimit</code> property is tried first, then the
 * woodstox <code>maxEntityCount</code> property. If neither is accepted, a warning
 * is logged, but only the first time this happens.
 *
 * @param inputFactory factory to configure
 */
private static void trySetStaxSecurityManager(XMLInputFactory inputFactory) {
    //try default java entity expansion, then fallback to woodstox, then warn...once.
    try {
        inputFactory.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit",
                MAX_ENTITY_EXPANSIONS);
        return;
    } catch (IllegalArgumentException jaxpRejected) {
        //fall through to the woodstox property
    }
    try {
        inputFactory.setProperty("com.ctc.wstx.maxEntityCount", MAX_ENTITY_EXPANSIONS);
    } catch (IllegalArgumentException woodstoxRejected) {
        if (!HAS_WARNED_STAX.getAndSet(true)) {
            LOG.warn("Could not set limit on maximum entity expansions for: " + inputFactory.getClass());
        }
    }
}
/**
* Get the maximum number of times a SAXParser or DOMBuilder may be reused.
*
* @return the current maximum number of reuses
*/
public static int getMaxNumReuses() {
return MAX_NUM_REUSES;
}
/**
* Set the maximum number of times a SAXParser or DOMBuilder may be reused.
* The value is stored as given; this method does not validate it and does not
* touch the pools.
*
* @param maxNumReuses new maximum number of reuses
*/
public static void setMaxNumReuses(int maxNumReuses) {
MAX_NUM_REUSES = maxNumReuses;
}
/**
* Get the current pool size.
*
* @return the size last applied by {@link #setPoolSize(int)}
*/
public static int getPoolSize() {
return POOL_SIZE;
}
/**
* Set the pool size for cached XML parsers. This has a side
* effect of locking the pool, and rebuilding the pool from
* scratch with the most recent settings, such as {@link #MAX_ENTITY_EXPANSIONS}
*
* As of Tika 3.2.1, if a value of <code>0</code> is passed in, no SAXParsers or DOMBuilders
* will be pooled, and a new parser/builder will be built for each parse.
*
* @param poolSize new pool size; must be &gt;= 0
* @throws IllegalArgumentException if <code>poolSize</code> is negative
* @throws TikaException if a SAX parser or DOM builder for the new pool cannot be created
* @since Apache Tika 1.19
*/
public static void setPoolSize(int poolSize) throws TikaException {
if (poolSize < 0) {
throw new IllegalArgumentException("PoolSize must be >= 0");
}
//stop the world with a write lock.
//parsers that are currently in use will be offered later (once the lock is released),
//but not accepted and will be gc'd. We have to do this locking and
//the read locking in case one thread resizes the pool when the
//parsers have already started. We could have an NPE on SAX_PARSERS
//if we didn't lock.
SAX_POOL_LOCK
.writeLock().lock();
try {
//free up any resources before emptying SAX_PARSERS
for (PoolSAXParser parser : SAX_PARSERS) {
parser.reset();
}
SAX_PARSERS.clear();
if (poolSize > 0) {
SAX_PARSERS = new ArrayBlockingQueue<>(poolSize);
//the generation is only bumped here, i.e. only when the new size is > 0
int generation = POOL_GENERATION.incrementAndGet();
for (int i = 0; i < poolSize; i++) {
try {
SAX_PARSERS.offer(buildPoolParser(generation, getSAXParserFactory().newSAXParser()));
} catch (SAXException | ParserConfigurationException e) {
throw new TikaException("problem creating sax parser", e);
}
}
}
} finally {
SAX_POOL_LOCK
.writeLock().unlock();
}
//the DOM pool is rebuilt separately, under its own write lock
DOM_POOL_LOCK
.writeLock().lock();
try {
DOM_BUILDERS.clear();
if (poolSize > 0) {
DOM_BUILDERS = new ArrayBlockingQueue<>(poolSize);
for (int i = 0; i < poolSize; i++) {
//NOTE(review): the generation is re-read here, after the SAX lock was released --
//confirm that concurrent calls to setPoolSize are not expected
DOM_BUILDERS.offer(new PoolDOMBuilder(POOL_GENERATION.get(), getDocumentBuilder()));
}
}
} finally {
DOM_POOL_LOCK
.writeLock().unlock();
}
//only reached when both pools were rebuilt without an exception
POOL_SIZE = poolSize;
}
/**
* Get the maximum number of entity expansions currently configured.
*
* @return the current maximum number of entity expansions
*/
public static int getMaxEntityExpansions() {
return MAX_ENTITY_EXPANSIONS;
}
/**
* Set the maximum number of entity expansions allowable in SAX/DOM/StAX parsing.
* <b>NOTE:</b>A value less than or equal to zero indicates no limit.
* This will override the system property {@link #JAXP_ENTITY_EXPANSION_LIMIT_KEY}
* and the {@link #DEFAULT_MAX_ENTITY_EXPANSIONS} value for allowable entity expansions
* <p>
* <b>NOTE:</b> To trigger a rebuild of the pool of parsers with this setting,
* the client must call {@link #setPoolSize(int)} to rebuild the SAX and DOM parsers
* with this setting.
* </p>
*
* @param maxEntityExpansions -- maximum number of allowable entity expansions
* @since Apache Tika 1.19
*/
public static void setMaxEntityExpansions(int maxEntityExpansions) {
//only the field is updated here; parsers that already exist are not reconfigured
MAX_ENTITY_EXPANSIONS = maxEntityExpansions;
}
/**
 * Looks up an attribute by local name, ignoring its namespace.
 * When several attributes share the local name, the first one wins.
 *
 * @param localName local name to look for
 * @param atts      attributes to search
 * @return attribute value with that local name or <code>null</code> if not found
 */
public static String getAttrValue(String localName, Attributes atts) {
    int idx = 0;
    while (idx < atts.getLength()) {
        if (localName.equals(atts.getLocalName(idx))) {
            return atts.getValue(idx);
        }
        idx++;
    }
    return null;
}
/**
 * Wraps a newly created {@link SAXParser} in the {@link PoolSAXParser} subclass that
 * matches what the underlying parser implementation supports.
 * <p>
 * Three capabilities are probed, in this order:
 * </p>
 * <ol>
 * <li>whether {@link SAXParser#reset()} can be called without an
 * {@link UnsupportedOperationException};</li>
 * <li>whether an instance of the class named by {@code XERCES_SECURITY_MANAGER},
 * configured with {@code MAX_ENTITY_EXPANSIONS}, can be installed through
 * {@code XERCES_SECURITY_MANAGER_PROPERTY};</li>
 * <li>only when no security manager could be installed: whether the JAXP
 * {@code entityExpansionLimit} property accepts {@code MAX_ENTITY_EXPANSIONS}.</li>
 * </ol>
 * <p>
 * Failures of the second and third probe are logged at most once every five minutes
 * (shared {@code LAST_LOG} timestamp); a missing security manager class is not logged.
 * </p>
 *
 * @param generation pool generation to stamp on the returned wrapper
 * @param parser     parser to probe and wrap
 * @return the wrapper matching the probed capabilities;
 *         {@link UnrecognizedPoolSAXParser} when no other combination applies
 * @throws SecurityException if installing the security manager raised one; these are
 *                           deliberately not swallowed
 */
private static PoolSAXParser buildPoolParser(int generation, SAXParser parser) {
    //probe 1: does this implementation support reset()?
    boolean canReset;
    try {
        parser.reset();
        canReset = true;
    } catch (UnsupportedOperationException e) {
        canReset = false;
    }

    //probe 2: can the Xerces security manager be instantiated reflectively and installed?
    boolean hasSecurityManager = false;
    try {
        Object mgr =
                Class.forName(XERCES_SECURITY_MANAGER).getDeclaredConstructor().newInstance();
        Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
        setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS);
        parser.setProperty(XERCES_SECURITY_MANAGER_PROPERTY, mgr);
        hasSecurityManager = true;
    } catch (SecurityException e) {
        //don't swallow security exceptions
        throw e;
    } catch (ClassNotFoundException e) {
        // continue without log, this is expected in some setups
    } catch (Throwable e) {
        // NOSONAR - also catch things like NoClassDefError here
        // throttle the log somewhat as it can spam the log otherwise
        if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
            LOG.warn("SAX Security Manager could not be setup [log suppressed for 5 minutes]",
                    e);
            LAST_LOG = System.currentTimeMillis();
        }
    }

    //probe 3: without a security manager, fall back to the JAXP entity expansion property
    boolean canSetJaxPEntity = false;
    if (!hasSecurityManager) {
        try {
            parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit",
                    MAX_ENTITY_EXPANSIONS);
            canSetJaxPEntity = true;
        } catch (SAXException e) { // NOSONAR
            // throttle the log somewhat as it can spam the log otherwise
            if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
                //this path is about the JAXP property, not the Xerces security manager,
                //so say so in the log
                LOG.warn(
                        "JAXP entity expansion limit could not be set [log suppressed for 5 " +
                                "minutes]",
                        e);
                LAST_LOG = System.currentTimeMillis();
            }
        }
    }

    //pick the wrapper whose reset() strategy fits the probed capabilities
    if (hasSecurityManager) {
        if (canReset) {
            return new Xerces2PoolSAXParser(generation, parser);
        }
        return new XercesPoolSAXParser(generation, parser);
    }
    if (canReset && canSetJaxPEntity) {
        return new BuiltInPoolSAXParser(generation, parser);
    }
    return new UnrecognizedPoolSAXParser(generation, parser);
}
/**
 * Replaces the content handler, DTD handler, entity resolver and error handler of the
 * given reader with this class's shared {@code IGNORING_*} instances. Does nothing
 * when the reader is <code>null</code>.
 *
 * @param reader reader whose handlers should be replaced; may be <code>null</code>
 */
private static void clearReader(XMLReader reader) {
    if (reader != null) {
        reader.setContentHandler(IGNORING_CONTENT_HANDLER);
        reader.setDTDHandler(IGNORING_DTD_HANDLER);
        reader.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
        reader.setErrorHandler(IGNORING_ERROR_HANDLER);
    }
}
/**
 * Pool entry that pairs a {@link DocumentBuilder} with the pool generation it was
 * created for and a usage counter.
 */
private static class PoolDOMBuilder {
    private final int poolGeneration;
    private final DocumentBuilder documentBuilder;
    //number of times incrementUses() has been called on this entry
    int numUses = 0;

    /**
     * @param poolGeneration  generation of the pool this builder belongs to
     * @param documentBuilder builder to wrap
     */
    PoolDOMBuilder(int poolGeneration, DocumentBuilder documentBuilder) {
        this.poolGeneration = poolGeneration;
        this.documentBuilder = documentBuilder;
    }

    /**
     * @return the pool generation passed to the constructor
     */
    public int getPoolGeneration() {
        return poolGeneration;
    }

    /**
     * @return the wrapped builder
     */
    public DocumentBuilder getDocumentBuilder() {
        return documentBuilder;
    }

    /**
     * Resets the wrapped builder and then re-applies the ignoring entity resolver and
     * a <code>null</code> error handler.
     */
    public void reset() {
        documentBuilder.reset();
        documentBuilder.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
        documentBuilder.setErrorHandler(null);
    }

    /**
     * Records one more use of this builder. The counter must actually advance
     * (as in {@link PoolSAXParser#incrementUses()}); assigning zero here would make
     * every builder look unused forever.
     */
    void incrementUses() {
        numUses++;
    }
}
/**
 * Base class for pooled SAX parsers. It carries the wrapped {@link SAXParser}, the
 * pool generation it was built for and a usage counter; subclasses decide how the
 * parser is brought back to a clean state in {@link #reset()}.
 */
private abstract static class PoolSAXParser {
    final int poolGeneration;
    final SAXParser saxParser;
    //incremented by incrementUses(); starts at zero
    int numUses;

    /**
     * @param poolGeneration generation of the pool this parser belongs to
     * @param saxParser      parser to wrap
     */
    PoolSAXParser(int poolGeneration, SAXParser saxParser) {
        this.saxParser = saxParser;
        this.poolGeneration = poolGeneration;
    }

    /**
     * @return the wrapped parser
     */
    public SAXParser getSAXParser() {
        return this.saxParser;
    }

    /**
     * @return the pool generation passed to the constructor
     */
    public int getGeneration() {
        return this.poolGeneration;
    }

    /**
     * Records one more use of this parser.
     */
    void incrementUses() {
        numUses += 1;
    }

    /**
     * Prepares the wrapped parser for its next use; the strategy is
     * implementation specific.
     */
    abstract void reset();
}
/**
 * Pool wrapper whose {@link #reset()} never calls {@link SAXParser#reset()}; it only
 * swaps the handlers of the parser's {@link XMLReader} for the ignoring ones.
 */
private static class XercesPoolSAXParser extends PoolSAXParser {
    public XercesPoolSAXParser(int generation, SAXParser parser) {
        super(generation, parser);
    }

    /**
     * Detaches caller-supplied handlers from the underlying reader. The parser itself
     * is left untouched, and a failure to obtain the reader is silently ignored.
     */
    @Override
    public void reset() {
        try {
            clearReader(saxParser.getXMLReader());
        } catch (SAXException ignored) {
            //swallow
        }
    }
}
/**
 * Pool wrapper that performs a full {@link SAXParser#reset()} while carrying the
 * value of {@code XERCES_SECURITY_MANAGER_PROPERTY} across the reset.
 */
private static class Xerces2PoolSAXParser extends PoolSAXParser {
    public Xerces2PoolSAXParser(int generation, SAXParser parser) {
        super(generation, parser);
    }

    /**
     * Reads the security manager property, resets the parser and writes the property
     * back (presumably because the reset would otherwise drop it -- confirm against the
     * parser implementation). A {@link SAXException} in that sequence is logged as a
     * warning. Afterwards the reader's handlers are replaced with the ignoring ones;
     * a failure to obtain the reader is silently ignored.
     */
    @Override
    void reset() {
        try {
            final Object securityManager =
                    saxParser.getProperty(XERCES_SECURITY_MANAGER_PROPERTY);
            saxParser.reset();
            saxParser.setProperty(XERCES_SECURITY_MANAGER_PROPERTY, securityManager);
        } catch (SAXException e) {
            LOG.warn("problem resetting sax parser", e);
        }
        try {
            clearReader(saxParser.getXMLReader());
        } catch (SAXException ignored) {
            // ignored
        }
    }
}
/**
 * Pool wrapper that resets the parser with {@link SAXParser#reset()} and then replaces
 * the reader's handlers with the ignoring ones.
 */
private static class BuiltInPoolSAXParser extends PoolSAXParser {
    public BuiltInPoolSAXParser(int generation, SAXParser parser) {
        super(generation, parser);
    }

    /**
     * Resets the parser, then detaches caller-supplied handlers from its reader.
     * A failure to obtain the reader is silently ignored.
     */
    @Override
    void reset() {
        // NOTE(review): nothing is re-applied after reset() here, unlike
        // Xerces2PoolSAXParser -- presumably the entity expansion limit survives
        // the reset on this implementation; confirm.
        saxParser.reset();
        try {
            clearReader(saxParser.getXMLReader());
        } catch (SAXException ignored) {
            // ignored
        }
    }
}
/**
 * Pool wrapper for parsers that match none of the recognized capability combinations.
 * For these, every {@link #reset()} attempts a parser reset and tries to set the
 * protections again.
 */
private static class UnrecognizedPoolSAXParser extends PoolSAXParser {
    public UnrecognizedPoolSAXParser(int generation, SAXParser parser) {
        super(generation, parser);
    }

    /**
     * Best-effort cleanup: attempts {@link SAXParser#reset()} (tolerating parsers that
     * do not support it), replaces the reader's handlers with the ignoring ones
     * (tolerating a failure to obtain the reader) and finally hands the parser to
     * {@code trySetXercesSecurityManager}.
     */
    @Override
    void reset() {
        try {
            saxParser.reset();
        } catch (UnsupportedOperationException ignored) {
            // ignored
        }
        try {
            clearReader(saxParser.getXMLReader());
        } catch (SAXException ignored) {
            // ignored
        }
        trySetXercesSecurityManager(saxParser);
    }
}
/**
 * Returns the DOM builder registered in the given parsing context, if any.
 * Otherwise a builder is obtained from {@link XMLReaderUtils#getDocumentBuilder()};
 * per the existing contract that builder applies the
 * {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER} and has its ErrorHandler set
 * to <code>null</code>.
 * Consider using {@link XMLReaderUtils#buildDOM(InputStream, ParseContext)}
 * instead for more efficient reuse of document builders.
 *
 * @param context parsing context that may carry a {@link DocumentBuilder}
 * @return DOM Builder
 * @throws TikaException declared for the fallback to the no-argument overload
 */
public static DocumentBuilder getDocumentBuilder(ParseContext context) throws TikaException {
    DocumentBuilder fromContext = context.get(DocumentBuilder.class);
    //an explicitly supplied builder always wins over the default one
    return fromContext == null ? XMLReaderUtils.getDocumentBuilder() : fromContext;
}
/**
 * Returns the StAX input factory registered in the given parsing context, if any.
 * Otherwise the factory comes from {@link XMLReaderUtils#getXMLInputFactory()};
 * per the existing contract that default factory is namespace-aware and applies
 * reasonable security precautions.
 *
 * @param context parsing context that may carry an {@link XMLInputFactory}
 * @return StAX input factory
 */
public static XMLInputFactory getXMLInputFactory(ParseContext context) {
    XMLInputFactory fromContext = context.get(XMLInputFactory.class);
    if (fromContext == null) {
        //nothing supplied by the caller: fall back to the default factory
        return XMLReaderUtils.getXMLInputFactory();
    } else {
        return fromContext;
    }
}
/**
 * Returns the transformer registered in the given parsing context, if any.
 * <p>
 * Otherwise the transformer comes from {@link XMLReaderUtils#getTransformer()};
 * per the existing contract that default transformer is configured to use
 * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
 *
 * @param context parsing context that may carry a {@link Transformer}
 * @return Transformer
 * @throws TikaException when the transformer can not be created
 */
public static Transformer getTransformer(ParseContext context) throws TikaException {
    Transformer fromContext = context.get(Transformer.class);
    //an explicitly supplied transformer always wins over the default one
    return fromContext != null ? fromContext : XMLReaderUtils.getTransformer();
}
}