// WireFeedInput.java

/*
 * Copyright 2004 Sun Microsystems, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package com.rometools.rome.io;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.WeakHashMap;

import org.jdom2.Document;
import org.jdom2.JDOMException;
import org.jdom2.input.DOMBuilder;
import org.jdom2.input.JDOMParseException;
import org.jdom2.input.sax.XMLReaders;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.XMLReader;

import com.rometools.rome.feed.WireFeed;
import com.rometools.rome.feed.impl.ConfigurableClassLoader;
import com.rometools.rome.io.impl.FeedParsers;
import com.rometools.rome.io.impl.XmlFixerReader;

/**
 * Parses an XML document (File, Reader, W3C SAX InputSource, W3C DOM Document or JDOM Document)
 * into a WireFeed (RSS/Atom).
 * <p>
 * It accepts all flavors of RSS (0.90, 0.91, 0.92, 0.93, 0.94, 1.0 and 2.0) and Atom 0.3 feeds.
 * Parsers are pluggable (they must implement the WireFeedParser interface).
 * <p>
 * The WireFeedInput uses liberal parsers.
 */
public class WireFeedInput {

    // Empty source returned for every "*.dtd" system id, so the parser reads an empty
    // document instead of resolving the DTD itself.
    private static final InputSource EMPTY_INPUTSOURCE = new InputSource(new ByteArrayInputStream(new byte[0]));
    private static final EntityResolver RESOLVER = new EmptyEntityResolver();

    // One FeedParsers registry per class loader. Every access goes through getFeedParsers(),
    // which holds the WireFeedInput.class monitor.
    private static final Map<ClassLoader, FeedParsers> clMap = new WeakHashMap<ClassLoader, FeedParsers>();

    private final boolean validate;
    private final Locale locale;

    private boolean xmlHealerOn;
    private boolean allowDoctypes = false;

    /**
     * Returns the FeedParsers registry associated with the class loader reported by
     * ConfigurableClassLoader, creating and caching it on first use.
     *
     * @return the registry for the current class loader, never null.
     */
    private static FeedParsers getFeedParsers() {
        synchronized (WireFeedInput.class) {
            final ClassLoader classLoader = ConfigurableClassLoader.INSTANCE.getClassLoader();
            FeedParsers parsers = clMap.get(classLoader);
            if (parsers == null) {
                parsers = new FeedParsers();
                clMap.put(classLoader, parsers);
            }
            return parsers;
        }
    }

    /**
     * Entity resolver that answers every request for a system id ending in ".dtd" with an empty
     * input source and leaves all other entities to the parser's default resolution.
     */
    private static class EmptyEntityResolver implements EntityResolver {
        @Override
        public InputSource resolveEntity(final String publicId, final String systemId) {
            if (systemId != null && systemId.endsWith(".dtd")) {
                return EMPTY_INPUTSOURCE;
            }
            // null asks the parser to fall back to its default behaviour
            return null;
        }
    }

    /**
     * Returns the list of supported input feed types.
     *
     * @see WireFeed for details on the format of these strings.
     * @return a list of String elements with the supported input feed types.
     */
    public static List<String> getSupportedFeedTypes() {
        return getFeedParsers().getSupportedFeedTypes();
    }

    /**
     * Creates a WireFeedInput instance with input validation turned off and {@link Locale#US} as
     * the locale.
     */
    public WireFeedInput() {
        this(false, Locale.US);
    }

    /**
     * Creates a WireFeedInput instance with XML healing enabled.
     *
     * @param validate indicates if the input should be validated. NOT IMPLEMENTED YET (the value
     *            is ignored and validation does not happen)
     * @param locale locale handed to the feed parser when a document is parsed.
     */
    public WireFeedInput(final boolean validate, final Locale locale) {
        this.validate = false; // TODO FIX THIS THINGY
        xmlHealerOn = true;
        this.locale = locale;
    }

    /**
     * Enables XML healing in the WireFeedInput instance.
     * <p>
     * Healing trims leading chars from the stream (empty spaces and comments) until the XML prolog.
     * <p>
     * Healing resolves HTML entities (from literal to code number) in the reader.
     * <p>
     * The healing is done only with the build(File) and build(Reader) signatures.
     * <p>
     * By default is TRUE.
     *
     * @param heals TRUE enables stream healing, FALSE disables it.
     */
    public void setXmlHealerOn(final boolean heals) {
        xmlHealerOn = heals;
    }

    /**
     * Indicates if the WireFeedInput instance will XML heal (if necessary) the character stream.
     * <p>
     * Healing trims leading chars from the stream (empty spaces and comments) until the XML prolog.
     * <p>
     * Healing resolves HTML entities (from literal to code number) in the reader.
     * <p>
     * The healing is done only with the build(File) and build(Reader) signatures.
     * <p>
     * By default is TRUE.
     *
     * @return TRUE if healing is enabled, FALSE if not.
     */
    public boolean getXmlHealerOn() {
        return xmlHealerOn;
    }

    /**
     * Indicates whether Doctype declarations are allowed.
     *
     * @return true when Doctype declarations are allowed, false otherwise
     */
    public boolean isAllowDoctypes() {
        return allowDoctypes;
    }

    /**
     * Since ROME 1.5.1 we fixed a security vulnerability by disallowing Doctype declarations by
     * default. This change breaks the compatibility with at least RSS 0.91N because it requires a
     * Doctype declaration. You are able to allow Doctype declarations again with this property.
     * You should only activate it when the feeds that you process are absolutely trustworthy.
     *
     * @param allowDoctypes true when Doctype declarations should be allowed again, false otherwise
     */
    public void setAllowDoctypes(boolean allowDoctypes) {
        this.allowDoctypes = allowDoctypes;
    }

    /**
     * Builds a WireFeed (RSS or Atom) from a file.
     * <p>
     * NOTE: This method delegates to {@link #build(Reader)}, which applies XML healing when it is
     * enabled and in turn delegates to {@link #build(Document)}. The file is always closed before
     * this method returns.
     *
     * @param file file to read to create the WireFeed.
     * @return the WireFeed read from the file.
     * @throws FileNotFoundException thrown if the file could not be found.
     * @throws IOException thrown if there is problem reading the file.
     * @throws IllegalArgumentException thrown if feed type could not be understood by any of the
     *             underlying parsers.
     * @throws FeedException if the feed could not be parsed
     */
    public WireFeed build(final File file) throws FileNotFoundException, IOException, IllegalArgumentException, FeedException {
        // NOTE(review): FileReader decodes with the JVM default charset and ignores any encoding
        // declared in the XML prolog; callers needing a specific encoding should use build(Reader).
        final Reader reader = new FileReader(file);
        try {
            // build(Reader) already wraps the reader in an XmlFixerReader when healing is on;
            // wrapping here as well would push every character through the healer twice.
            return this.build(reader);
        } finally {
            reader.close();
        }
    }

    /**
     * Builds a WireFeed (RSS or Atom) from a Reader.
     * <p>
     * When XML healing is enabled the reader is wrapped in an XmlFixerReader before parsing. The
     * reader is not closed by this method.
     * <p>
     * NOTE: This method delegates to {@link #build(Document)}.
     *
     * @param reader Reader to read to create the WireFeed.
     * @return the WireFeed read from the Reader.
     * @throws IllegalArgumentException thrown if feed type could not be understood by any of the
     *             underlying parsers.
     * @throws FeedException if the feed could not be parsed. Every failure other than an
     *             IllegalArgumentException is reported as a ParsingFeedException.
     */
    public WireFeed build(Reader reader) throws IllegalArgumentException, FeedException {
        final SAXBuilder saxBuilder = createSAXBuilder();
        try {
            if (xmlHealerOn) {
                reader = new XmlFixerReader(reader);
            }
            final Document document = saxBuilder.build(reader);
            return this.build(document);
        } catch (final JDOMParseException ex) {
            // keep the parser's own message, which pinpoints the malformed XML
            throw new ParsingFeedException("Invalid XML: " + ex.getMessage(), ex);
        } catch (final IllegalArgumentException ex) {
            // unknown feed type: propagate untouched, as documented
            throw ex;
        } catch (final Exception ex) {
            // everything else, including failures raised by build(Document), is wrapped
            throw new ParsingFeedException("Invalid XML", ex);
        }
    }

    /**
     * Builds a WireFeed (RSS or Atom) from a W3C SAX InputSource.
     * <p>
     * No XML healing is applied to the input source.
     * <p>
     * NOTE: This method delegates to {@link #build(Document)}.
     *
     * @param is W3C SAX InputSource to read to create the WireFeed.
     * @return the WireFeed read from the W3C SAX InputSource.
     * @throws IllegalArgumentException thrown if feed type could not be understood by any of the
     *             underlying parsers.
     * @throws FeedException if the feed could not be parsed. Every failure other than an
     *             IllegalArgumentException is reported as a ParsingFeedException.
     */
    public WireFeed build(final InputSource is) throws IllegalArgumentException, FeedException {
        final SAXBuilder saxBuilder = createSAXBuilder();
        try {
            final Document document = saxBuilder.build(is);
            return this.build(document);
        } catch (final JDOMParseException ex) {
            // keep the parser's own message, which pinpoints the malformed XML
            throw new ParsingFeedException("Invalid XML: " + ex.getMessage(), ex);
        } catch (final IllegalArgumentException ex) {
            // unknown feed type: propagate untouched, as documented
            throw ex;
        } catch (final Exception ex) {
            // everything else, including failures raised by build(Document), is wrapped
            throw new ParsingFeedException("Invalid XML", ex);
        }
    }

    /**
     * Builds a WireFeed (RSS or Atom) from a W3C DOM document.
     * <p>
     * The DOM tree is converted to a JDOM document with a DOMBuilder first.
     * <p>
     * NOTE: This method delegates to {@link #build(Document)}.
     *
     * @param document W3C DOM document to read to create the WireFeed.
     * @return the WireFeed read from the W3C DOM document.
     * @throws IllegalArgumentException thrown if feed type could not be understood by any of the
     *             underlying parsers.
     * @throws FeedException if the feed could not be parsed. Every failure other than an
     *             IllegalArgumentException is reported as a ParsingFeedException.
     */
    public WireFeed build(final org.w3c.dom.Document document) throws IllegalArgumentException, FeedException {
        final DOMBuilder domBuilder = new DOMBuilder();
        try {
            final Document jdomDoc = domBuilder.build(document);
            return this.build(jdomDoc);
        } catch (final IllegalArgumentException ex) {
            // unknown feed type: propagate untouched, as documented
            throw ex;
        } catch (final Exception ex) {
            // everything else, including failures raised by build(Document), is wrapped
            throw new ParsingFeedException("Invalid XML", ex);
        }
    }

    /**
     * Builds a WireFeed (RSS or Atom) from a JDOM document.
     * <p>
     * NOTE: All other build methods delegate to this method.
     *
     * @param document JDOM document to read to create the WireFeed.
     * @return the WireFeed read from the JDOM document.
     * @throws IllegalArgumentException thrown if feed type could not be understood by any of the
     *             underlying parsers.
     * @throws FeedException if the feed could not be parsed
     */
    public WireFeed build(final Document document) throws IllegalArgumentException, FeedException {
        final WireFeedParser parser = getFeedParsers().getParserFor(document);
        if (parser == null) {
            throw new IllegalArgumentException("Invalid document");
        }
        return parser.parse(document, validate, locale);
    }

    /**
     * Creates and sets up a SAXBuilder for parsing.
     * <p>
     * The builder uses the DTD-swallowing entity resolver, has external general/parameter entities
     * and external DTD loading switched off where the underlying parser supports those features,
     * rejects Doctype declarations unless {@link #isAllowDoctypes()} is true, and does not expand
     * entities.
     *
     * @return a new SAXBuilder object
     * @throws IllegalStateException if JDOM could not create the underlying SAX parser.
     */
    protected SAXBuilder createSAXBuilder() {
        SAXBuilder saxBuilder;
        if (validate) {
            saxBuilder = new SAXBuilder(XMLReaders.DTDVALIDATING);
        } else {
            saxBuilder = new SAXBuilder(XMLReaders.NONVALIDATING);
        }
        saxBuilder.setEntityResolver(RESOLVER);

        //
        // This code is needed to fix the security problem outlined in
        // http://www.securityfocus.com/archive/1/297714
        //
        // Unfortunately there isn't an easy way to check if an XML parser
        // supports a particular feature, so we need to set it and catch the
        // exception if it fails. We also need to subclass the JDOM SAXBuilder
        // class in order to get access to the underlying SAX parser - otherwise
        // the features don't get set until we are already building the
        // document, by which time it's too late to fix the problem.
        //
        // Crimson is one parser which is known not to support these features.
        try {

            // Throw-away parser used only to probe which features are supported.
            final XMLReader parser = saxBuilder.createParser();

            setFeature(saxBuilder, parser, "http://xml.org/sax/features/external-general-entities", false);
            setFeature(saxBuilder, parser, "http://xml.org/sax/features/external-parameter-entities", false);
            setFeature(saxBuilder, parser, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

            if (!allowDoctypes) {
                setFeature(saxBuilder, parser, "http://apache.org/xml/features/disallow-doctype-decl", true);
            }

        } catch (final JDOMException e) {
            throw new IllegalStateException("JDOM could not create a SAX parser", e);
        }

        saxBuilder.setExpandEntities(false);

        return saxBuilder;

    }

    /**
     * Sets a feature on the builder, but only if the probe parser accepts it.
     *
     * @param saxBuilder builder to configure.
     * @param parser parser used to test whether the feature is supported.
     * @param feature URI of the SAX feature.
     * @param value value to set for the feature.
     */
    private void setFeature(SAXBuilder saxBuilder, XMLReader parser, String feature, boolean value) {
        if (isFeatureSupported(parser, feature, value)) {
            saxBuilder.setFeature(feature, value);
        }
    }

    /**
     * Tests whether a parser accepts a feature by setting it on that parser.
     *
     * @param parser parser to probe; the feature stays set on it when supported.
     * @param feature URI of the SAX feature.
     * @param value value to try for the feature.
     * @return true if the parser accepted the feature, false if it does not recognize or does not
     *         support it.
     */
    private boolean isFeatureSupported(XMLReader parser, String feature, boolean value) {
        try {
            parser.setFeature(feature, value);
            return true;
        } catch (final SAXNotRecognizedException e) {
            return false;
        } catch (final SAXNotSupportedException e) {
            return false;
        }
    }

}