JSoupParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.html;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import javax.xml.XMLConstants;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.parser.TagSet;
import org.jsoup.select.NodeFilter;
import org.jsoup.select.NodeTraversor;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import org.apache.tika.config.ConfigDeserializer;
import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.ParseContext;


/**
 * HTML parser. Uses JSoup to turn the input document to HTML SAX events,
 * and post-processes the events to produce XHTML and metadata expected by
 * Tika clients.
 */
@TikaComponent(name = "jsoup-parser")
public class JSoupParser extends AbstractEncodingDetectorParser {

    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 7895315240498733128L;

    /**
     * Charset used when the encoding detector returns no result.
     */
    public static final Charset DEFAULT_CHARSET = StandardCharsets.US_ASCII;

    /**
     * Configuration class for JSON deserialization.
     */
    public static class Config implements Serializable {
        /** Whether the contents of script entities should be extracted. */
        public boolean extractScripts = false;
    }

    /** Classpath resource (relative to this class) listing the self-closeable tag names. */
    private static final String SELF_CLOSEABLE_TAGS_RESOURCE = "self-closeable-tags.txt";

    private static final MediaType XHTML = MediaType.application("xhtml+xml");
    private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
    private static final MediaType X_ASP = MediaType.application("x-asp");

    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
            new HashSet<MediaType>(Arrays.asList(MediaType.text("html"), XHTML, WAP_XHTML, X_ASP)));

    /**
     * Template tag set: the jsoup HTML defaults plus the self-close flag on every
     * tag listed in the bundled resource. Parsing always works on a copy of it.
     */
    private static final TagSet SELF_CLOSEABLE_TAGS = loadSelfCloseableTags();

    private boolean extractScripts = false;

    public JSoupParser() {
        super();
    }

    public JSoupParser(EncodingDetector encodingDetector) {
        super(encodingDetector);
    }

    /**
     * Constructor with explicit Config object.
     *
     * @param config the configuration
     */
    public JSoupParser(Config config) {
        super();
        this.extractScripts = config.extractScripts;
    }

    /**
     * Constructor for JSON configuration.
     * Requires Jackson on the classpath.
     *
     * @param jsonConfig JSON configuration
     */
    public JSoupParser(JsonConfig jsonConfig) {
        this(ConfigDeserializer.buildConfig(jsonConfig, Config.class));
    }

    /**
     * Builds the template tag set from the bundled resource. Blank lines and lines
     * whose first non-whitespace character is <code>#</code> are skipped; every
     * other line is taken (trimmed) as a tag name in the HTML namespace and flagged
     * as self-closeable.
     *
     * @return the HTML tag set with the self-close flag applied to the listed tags
     * @throws RuntimeException if the resource is missing or cannot be read
     */
    private static TagSet loadSelfCloseableTags() {
        InputStream resource = JSoupParser.class.getResourceAsStream(SELF_CLOSEABLE_TAGS_RESOURCE);
        if (resource == null) {
            // getResourceAsStream signals a missing resource with null, not an IOException
            throw new RuntimeException("Can't find self-closeable-tags.txt");
        }
        TagSet tags = TagSet.Html();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(resource, StandardCharsets.UTF_8))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String name = line.trim();
                if (name.isEmpty() || name.startsWith("#")) {
                    continue;
                }
                Tag tag = tags.valueOf(name, Parser.NamespaceHtml);
                tag.set(Tag.SelfClose);
            }
        } catch (IOException e) {
            throw new RuntimeException("Can't find self-closeable-tags.txt", e);
        }
        return tags;
    }

    /**
     * Returns the media types handled by this parser.
     *
     * @param context the parse context (not consulted)
     * @return an unmodifiable set containing text/html, application/xhtml+xml,
     *         application/vnd.wap.xhtml+xml and application/x-asp
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * @return whether the contents of script entities are extracted
     */
    public boolean isExtractScripts() {
        return extractScripts;
    }

    /**
     * Whether or not to extract contents in script entities.
     * Default is <code>false</code>
     *
     * @param extractScripts <code>true</code> to extract script contents
     */
    public void setExtractScripts(boolean extractScripts) {
        this.extractScripts = extractScripts;
    }

    /**
     * Detects the charset of the stream, records content type and encoding in the
     * metadata, parses the stream with jsoup and replays the resulting document as
     * SAX events on the given handler.
     *
     * @param tis      the stream to parse; it is close-shielded while jsoup reads it
     * @param handler  receiver of the SAX events
     * @param metadata read for the incoming content type, updated with the resolved
     *                 content type and encoding
     * @param context  may supply an {@link EncodingDetector} and an {@link HtmlMapper}
     * @throws IOException   if reading the stream fails
     * @throws SAXException  if the handler rejects an event
     * @throws TikaException declared by the parser contract
     */
    @Override
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {

        EncodingDetector encodingDetector = getEncodingDetector(context);
        List<EncodingResult> encResults = encodingDetector.detect(tis, metadata, context);
        Charset charset = encResults.isEmpty() ? DEFAULT_CHARSET
                : encResults.get(0).getCharset();

        MediaType contentType = resolveContentType(metadata.get(Metadata.CONTENT_TYPE), charset);
        if (contentType != null) {
            metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
        }
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        // Work on a private copy so this parse never touches the shared template.
        TagSet tagSet = new TagSet(SELF_CLOSEABLE_TAGS);
        /* TODO -- when we upgrade jsoup to 1.21.1
                .onNewTag(tag -> {
            if (!tag.isKnownTag())
                tag.set(Tag.SelfClose);
        });
        */

        //do better with baseUri?
        // The shield is lifted again even when jsoup fails.
        tis.setCloseShield();
        Document document;
        try {
            document = Jsoup.parse(tis, charset.name(), "",
                    Parser.htmlParser().tagSet(tagSet));
        } finally {
            tis.removeCloseShield();
        }
        emitSaxEvents(document, handler, metadata, context);
    }

    /**
     * Parses HTML that is already available as a string and replays the resulting
     * document as SAX events on the given handler. No charset detection is done and
     * no content type or encoding is written to the metadata by this method itself.
     *
     * @param html     the markup to parse
     * @param handler  receiver of the SAX events
     * @param metadata passed through to the HTML handler
     * @param context  may supply an {@link HtmlMapper}
     * @throws SAXException if the handler rejects an event
     */
    public void parseString(String html, ContentHandler handler, Metadata metadata, ParseContext context) throws SAXException {
        //do better with baseUri?
        // Same as parse(): use a per-call copy rather than handing the shared static
        // TagSet to the parser, which may register tags it encounters.
        TagSet tagSet = new TagSet(SELF_CLOSEABLE_TAGS);
        Document document = Jsoup.parse(html, Parser.htmlParser().tagSet(tagSet));
        emitSaxEvents(document, handler, metadata, context);
    }

    /**
     * Maps the content type already present in the metadata to the media type to
     * report, with the detected charset attached.
     *
     * @param previous the existing content type value, may be <code>null</code>
     * @param charset  the charset to attach
     * @return the media type to store, or <code>null</code> if the existing value is
     *         none of the supported types and should be left as is
     */
    private static MediaType resolveContentType(String previous, Charset charset) {
        if (previous == null || previous.startsWith("text/html")) {
            return new MediaType(MediaType.TEXT_HTML, charset);
        }
        if (previous.startsWith("application/xhtml+xml")) {
            return new MediaType(XHTML, charset);
        }
        if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
            return new MediaType(WAP_XHTML, charset);
        }
        if (previous.startsWith("application/x-asp")) {
            return new MediaType(X_ASP, charset);
        }
        return null;
    }

    /**
     * Switches the document to quirks mode and walks it, forwarding every node as
     * SAX events through the XHTML downgrade and HTML handlers to the caller's
     * handler. <code>endDocument</code> is always sent, even if traversal fails.
     *
     * @param document the parsed jsoup document
     * @param handler  the caller's handler
     * @param metadata passed through to the HTML handler
     * @param context  source of the {@link HtmlMapper}; a {@link DefaultHtmlMapper}
     *                 is used if none is registered
     * @throws SAXException if any handler in the chain rejects an event
     */
    private void emitSaxEvents(Document document, ContentHandler handler, Metadata metadata,
                               ParseContext context) throws SAXException {
        // Get the HTML mapper from the parse context
        HtmlMapper mapper = context.get(HtmlMapper.class, new DefaultHtmlMapper());

        document.quirksMode(Document.QuirksMode.quirks);
        ContentHandler xhtml = new XHTMLDowngradeHandler(
                new HtmlHandler(mapper, handler, metadata, context, extractScripts));
        xhtml.startDocument();
        try {
            NodeTraversor.filter(new TikaNodeFilter(xhtml), document);
        } catch (RuntimeSAXException e) {
            // The filter callbacks cannot throw checked exceptions; unwrap here.
            throw e.getWrapped();
        } finally {
            xhtml.endDocument();
        }
    }

    /**
     * Node filter that translates jsoup nodes into SAX events: text and data nodes
     * become <code>characters</code> events, every other node becomes a
     * <code>startElement</code>/<code>endElement</code> pair named after the node.
     * SAX failures are tunnelled out as {@link RuntimeSAXException}.
     */
    private static final class TikaNodeFilter implements NodeFilter {
        private final ContentHandler handler;

        private TikaNodeFilter(ContentHandler handler) {
            this.handler = handler;
        }

        @Override
        public NodeFilter.FilterResult head(Node node, int i) {
            if (node instanceof TextNode) {
                emitCharacters(((TextNode) node).getWholeText());
                return FilterResult.CONTINUE;
            }
            if (node instanceof DataNode) {
                //maybe handle script data directly here instead of
                //passing it through to the HTMLHandler?
                emitCharacters(((DataNode) node).getWholeData());
                return FilterResult.CONTINUE;
            }

            // Copy the jsoup attributes; the key serves as both local and qualified name.
            AttributesImpl attributes = new AttributesImpl();
            Iterator<Attribute> jsoupAttrs = node.attributes().iterator();
            while (jsoupAttrs.hasNext()) {
                Attribute jsoupAttr = jsoupAttrs.next();
                attributes.addAttribute("", jsoupAttr.getKey(), jsoupAttr.getKey(), "",
                        jsoupAttr.getValue());
            }
            try {
                handler.startElement(XMLConstants.NULL_NS_URI, node.nodeName(), node.nodeName(),
                        attributes);
            } catch (SAXException e) {
                throw new RuntimeSAXException(e);
            }
            return FilterResult.CONTINUE;
        }

        @Override
        public NodeFilter.FilterResult tail(Node node, int i) {
            // Text and data nodes were emitted entirely in head(); nothing to close.
            if (node instanceof TextNode || node instanceof DataNode) {
                return FilterResult.CONTINUE;
            }
            try {
                handler.endElement(XMLConstants.NULL_NS_URI, node.nodeName(), node.nodeName());
            } catch (SAXException e) {
                throw new RuntimeSAXException(e);
            }
            return FilterResult.CONTINUE;
        }

        /**
         * Sends the text as a single <code>characters</code> event; <code>null</code>
         * and empty text produce no event.
         */
        private void emitCharacters(String txt) {
            if (txt == null || txt.isEmpty()) {
                return;
            }
            char[] chars = txt.toCharArray();
            try {
                handler.characters(chars, 0, chars.length);
            } catch (SAXException e) {
                throw new RuntimeSAXException(e);
            }
        }
    }

    /**
     * Unchecked carrier for a {@link SAXException} raised inside the node filter.
     * The wrapped exception is also set as the cause so stack traces stay complete.
     */
    private static final class RuntimeSAXException extends RuntimeException {
        private final SAXException wrapped;

        private RuntimeSAXException(SAXException e) {
            super(e);
            this.wrapped = e;
        }

        SAXException getWrapped() {
            return wrapped;
        }
    }

    /**
     * Look for an EncodingDetector in the ParseContext.  If it hasn't been
     * passed in, use the original EncodingDetector from initialization.
     *
     * @param parseContext the context to search for an EncodingDetector
     * @return the detector from the context if present, otherwise the one this
     *         parser was initialized with
     */
    protected EncodingDetector getEncodingDetector(ParseContext parseContext) {

        EncodingDetector fromParseContext = parseContext.get(EncodingDetector.class);
        if (fromParseContext != null) {
            return fromParseContext;
        }

        return getEncodingDetector();
    }

}