// XMLReaderUtilsFuzzer.java

// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

package com.example;

import java.io.IOException;
import java.io.InputStream;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Comparator;
import java.util.stream.Stream;

import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.w3c.dom.Document;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ToTextContentHandler;
import org.apache.tika.utils.XMLReaderUtils;


/**
 * Fuzz harness that pushes the same input bytes through the DOM, SAX and StAX
 * parsing paths obtained from {@link XMLReaderUtils}.
 */
class XMLReaderUtilsFuzzer {

    /**
     * Runs one input through {@link #parseOne(byte[])}.
     *
     * <p>{@link TikaException}, {@link IOException} and {@link SAXException} are printed
     * and otherwise ignored; a {@link java.io.FileNotFoundException} is rethrown.
     *
     * @param bytes raw input handed to each parser
     * @throws Exception a rethrown {@code FileNotFoundException}, or any unchecked
     *     exception that escapes the parsers
     */
    public static void fuzzerTestOneInput(byte[] bytes) throws Exception {
        try {
            parseOne(bytes);
        } catch (java.io.FileNotFoundException e) {
            // This should be rethrown because it could signal an XML parser looking
            // for a DTD. It must be caught before the IOException branch below,
            // since FileNotFoundException is a subclass of IOException.
            throw e;
        } catch (TikaException | IOException | SAXException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses {@code bytes} three times, each on a fresh stream: DOM, then SAX, then StAX.
     *
     * <p>An exception that is not swallowed by one stage propagates immediately, so the
     * later stages are skipped for that input.
     *
     * @param bytes raw input to parse
     * @throws TikaException propagated from the {@link XMLReaderUtils} calls
     * @throws IOException propagated from stream handling or parsing
     * @throws SAXException from the DOM stage when it is not a {@link SAXParseException}
     */
    private static void parseOne(byte[] bytes) throws TikaException, IOException, SAXException {

        // DOM: only the build matters, the resulting document is discarded.
        try (InputStream is = TikaInputStream.get(bytes)) {
            XMLReaderUtils.buildDOM(is, new ParseContext());
        } catch (SAXParseException ignored) {
            // Malformed XML is the expected outcome for most inputs.
        }

        // SAX: every SAXException is swallowed here, not just parse errors.
        try (InputStream is = TikaInputStream.get(bytes)) {
            ToTextContentHandler toTextContentHandler = new ToTextContentHandler();
            XMLReaderUtils.parseSAX(is, toTextContentHandler, new ParseContext());
        } catch (SAXException ignored) {
            // Swallowed on purpose.
        }

        // StAX: pull every event until the reader is exhausted.
        try (InputStream is = TikaInputStream.get(bytes)) {
            XMLStreamReader reader = XMLReaderUtils.getXMLInputFactory(new ParseContext())
                .createXMLStreamReader(is);
            try {
                while (reader.hasNext()) {
                    reader.next();
                }
            } finally {
                // XMLStreamReader is not AutoCloseable, so it has to be released by hand;
                // try-with-resources above only closes the underlying stream.
                try {
                    reader.close();
                } catch (XMLStreamException ignored) {
                    // A failure while closing must not mask an exception from the loop.
                }
            }
        } catch (java.util.MissingResourceException | XMLStreamException ignored) {
            // MissingResourceException can be thrown when an internal DTD has an
            // InvalidCharInDTD.
        }
    }
}