XMLReaderUtilsTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.utils;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;
import java.io.ByteArrayInputStream;
import java.net.ConnectException;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.NoSuchElementException;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Test;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ToTextContentHandler;
/**
* Class to test that XMLReaderUtils defends against xxe and billion laughs.
* <p>
* Different versions and different implementations vary. This is not a fully comprehensive set of tests.
* <p>
* Please add more.
* <p>
* See also the tests with woodstox in tika-woodstox-tests.
*/
public class XMLReaderUtilsTest {
private static final Locale defaultLocale = Locale.getDefault();
static {
//tests on content of Exception msgs require specifying locale.
//even this, though is not sufficient for the billion laughs tests ?!
Locale.setDefault(Locale.US);
}
private static final String EXTERNAL_DTD_SIMPLE_FILE = "<?xml version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM \"tutorials.dtd\"><foo/>";
private static final String EXTERNAL_DTD_SIMPLE_URL = "<?xml version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM \"http://127.234.172.38:7845/bar\"><foo/>";
private static final String EXTERNAL_ENTITY = "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" +
" ]><foo>&bar;</foo>";
private static final String EXTERNAL_LOCAL_DTD = "<!DOCTYPE foo [" +
"<!ENTITY % local_dtd SYSTEM \"file:///usr/local/app/schema.dtd\">" +
"%local_dtd;]><foo/>";
private static final String BILLION_LAUGHS_CLASSICAL = "<?xml version=\"1.0\"?>\n" + "<!DOCTYPE lolz [\n" + " <!ENTITY lol \"lol\">\n" + " <!ELEMENT lolz (#PCDATA)>\n" +
" <!ENTITY lol1 \"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\">\n" + " <!ENTITY lol2 \"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\">\n" +
" <!ENTITY lol3 \"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">\n" +
" <!ENTITY lol4 \"&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;\">\n" +
" <!ENTITY lol5 \"&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;\">\n" +
" <!ENTITY lol6 \"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\">\n" +
" <!ENTITY lol7 \"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\">\n" +
" <!ENTITY lol8 \"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\">\n" +
" <!ENTITY lol9 \"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n" + "]>\n" + "<lolz>&lol9;</lolz>";
private static String BILLION_LAUGHS_VARIANT;
static {
StringBuilder entity = new StringBuilder();
for (int i = 0; i < 1000000; i++) {
entity.append("a");
}
StringBuilder xml = new StringBuilder();
xml.append("<?xml version=\"1.0\"?>\n" + "<!DOCTYPE kaboom [\n" + " <!ENTITY a \"");
xml.append(entity.toString());
xml.append("\">]>" + "<kaboom>");
for (int i = 0; i < 100000; i++) {
xml.append("&a;");
}
xml.append("</kaboom>");
BILLION_LAUGHS_VARIANT = xml.toString();
}
private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{ EXTERNAL_DTD_SIMPLE_FILE, EXTERNAL_DTD_SIMPLE_URL,
EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD };
private static final String[] BILLION_LAUGHS = new String[]{ BILLION_LAUGHS_CLASSICAL, BILLION_LAUGHS_VARIANT };
@AfterAll
public static void tearDown() {
Locale.setDefault(defaultLocale);
}
//make sure that parseSAX actually defends against external entities
@Test
public void testSAX() throws Exception {
for (String xml : EXTERNAL_ENTITY_XMLS) {
try {
XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
new ToTextContentHandler(), new ParseContext());
} catch (ConnectException e) {
fail("Parser tried to access resource: " + xml, e);
}
}
}
@Test
public void testDOM() throws Exception {
for (String xml : EXTERNAL_ENTITY_XMLS) {
try {
XMLReaderUtils.buildDOM(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext());
} catch (ConnectException e) {
fail("Parser tried to access resource: " + xml, e);
}
}
}
@Test
public void testStax() throws Exception {
for (String xml : EXTERNAL_ENTITY_XMLS) {
try {
XMLInputFactory xmlInputFactory = XMLReaderUtils.getXMLInputFactory(new ParseContext());
XMLEventReader reader = xmlInputFactory.createXMLEventReader(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
StringBuilder sb = new StringBuilder();
while (reader.hasNext()) {
sb.append(reader.next());
}
if (sb.toString().contains("Exception scanning External")) {
fail("tried to read external dtd");
}
} catch (XMLStreamException e) {
fail("StreamException: " + xml, e);
} catch (NoSuchElementException e) {
if (e.getMessage() != null) {
if (e.getMessage().contains("Connection refused")) {
fail("Vulnerable to ssrf via url: " + xml, e);
} else if (e.getMessage().contains("No such file")) {
fail("Vulnerable to local file read via external entity/dtd: " + xml, e);
}
}
}
}
}
@Test
public void testSAXBillionLaughs() throws Exception {
for (String xml : BILLION_LAUGHS) {
try {
XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
new ToTextContentHandler(), new ParseContext());
} catch (SAXException e) {
limitCheck(e);
}
}
}
@Test
public void testDOMBillionLaughs() throws Exception {
//confirm that ExpandEntityReferences has been set to false.
//some implementations ignore the expandEntityReferences=false, and we are still
//protected by the "The parser has encountered more than "20" entity expansions" SAXException.
//We need to check for either: empty content and no exception, or this SAXException
for (String xml : BILLION_LAUGHS) {
Document doc = null;
try {
doc = XMLReaderUtils.buildDOM(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext());
} catch (SAXException e) {
limitCheck(e);
continue;
}
NodeList nodeList = doc.getChildNodes();
StringBuilder sb = new StringBuilder();
dumpChildren(nodeList, sb);
assertEquals(0, sb
.toString()
.trim()
.length(), sb.toString());
}
}
private void dumpChildren(NodeList nodeList, StringBuilder sb) {
for (int i = 0; i < nodeList.getLength(); i++) {
Node n = nodeList.item(i);
String txt = n.getTextContent();
if (txt != null) {
sb.append(txt);
}
}
}
@Test
public void testStaxBillionLaughs() throws Exception {
/*
Turning off dtd support of the XMLInputFactory in XMLReaderUtils turns off entity expansions and
causes a "NoSuchElementException" with the "'lol9' was referenced but not declared" message with this line:
tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false);
If that line doesn't exist, then we get a
NoSuchElementException with: "The parser has encountered more than "20" entity expansions in this document; this is the limit imposed by the JDK."
*/
for (String xml : BILLION_LAUGHS) {
XMLInputFactory xmlInputFactory = XMLReaderUtils.getXMLInputFactory(new ParseContext());
XMLEventReader reader = xmlInputFactory.createXMLEventReader(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
try {
while (reader.hasNext()) {
reader.next();
}
} catch (NoSuchElementException e) {
//full message on temurin-17: The entity "lol9" was referenced, but not declared.
String msg = e.getLocalizedMessage();
if (msg != null) {
if (msg.contains("referenced") && msg.contains("not declared")) {
continue;
} else if (msg.contains("JAXP00010001")) {
continue;
}
}
throw e;
}
}
}
private void limitCheck(SAXException e) throws SAXException {
String msg = e.getLocalizedMessage();
if (msg == null) {
throw e;
}
//depending on the flavor/version of the jdk, entity expansions may be triggered
// OR entitySizeLimit may be triggered
//See TIKA-4471
if (msg.contains("JAXP00010001") || //entity expansions
msg.contains("JAXP00010003") || //max entity size limit
msg.contains("JAXP00010004") || //TotalEntitySizeLimit
msg.contains("entity expansions") ||
e.getMessage().contains("maxGeneralEntitySizeLimit")) {
return;
}
throw e;
}
}