HtmlHandler.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.html;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.HTML;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;

/**
 * SAX handler that consumes the element stream of an HTML document and writes
 * a filtered view of it to an {@link XHTMLContentHandler}.
 * <p>
 * Element names are compared against upper-case literals ("HTML", "BODY",
 * "SCRIPT", ...). Information found before the body (meta tags, title, base
 * location) is recorded in the supplied {@link Metadata}; elements and
 * attributes inside the body are emitted only when the {@link HtmlMapper}
 * maps them to a safe name.
 */
class HtmlHandler extends TextContentHandler {

    // Names of (mapped) attributes whose values are URLs; these values are
    // resolved against the current base location before being emitted.
    private static final Set<String> URI_ATTRIBUTES =
            new HashSet<>(Arrays.asList("src", "href", "longdesc", "cite"));
    // Matches an ICBM meta value: two signed decimal numbers separated by
    // commas and/or whitespace. Group 1 and group 2 hold the two numbers.
    private static final Pattern ICBM =
            Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");

    // Lower-cased meta tag names that are stored under a dedicated metadata
    // property instead of the generic HTML meta prefix.
    private static final Map<String, Property> META_HEADER_MAPPINGS = new HashMap<>();

    static {
        META_HEADER_MAPPINGS.put("author", TikaCoreProperties.CREATOR);
        META_HEADER_MAPPINGS.put("title", TikaCoreProperties.TITLE);
        META_HEADER_MAPPINGS.put("subject", TikaCoreProperties.SUBJECT);
        META_HEADER_MAPPINGS.put("keywords", Office.KEYWORDS);
        META_HEADER_MAPPINGS.put("description", TikaCoreProperties.DESCRIPTION);
    }
    // Shared empty attribute list; used as the "no script attributes" marker.
    private static final Attributes EMPTY_ATTS = new AttributesImpl();
    // Decides which elements/attributes are safe to emit and which are discarded.
    private final HtmlMapper mapper;
    // Downstream handler that receives the filtered XHTML events.
    private final XHTMLContentHandler xhtml;
    // Metadata of the document being parsed; updated while handling the head.
    private final Metadata metadata;
    // Parse context used to look up the embedded document extractor.
    private final ParseContext context;
    // Whether script text is buffered and handed to the embedded document extractor.
    private final boolean extractScripts;
    // Text collected while inside a TITLE element outside the body.
    private final StringBuilder title = new StringBuilder();
    private final DataURISchemeUtil dataURISchemeUtil = new DataURISchemeUtil();
    // Script text collected while inside SCRIPT elements (only if extractScripts).
    private final StringBuilder script = new StringBuilder();
    // Nesting depth counted from the BODY or FRAMESET element; 0 outside the body.
    private int bodyLevel = 0;
    // Nesting depth counted from an element the mapper marks as discarded.
    private int discardLevel = 0;
    // Nesting depth counted from the TITLE element.
    private int titleLevel = 0;
    // Nesting depth of SCRIPT elements.
    private int scriptLevel = 0;
    // Attributes captured from a SCRIPT element seen before the body;
    // reset to EMPTY_ATTS once the script element has been closed.
    private Attributes scriptAtts = EMPTY_ATTS;
    // True once the TITLE element's text has been stored as the metadata title.
    private boolean isTitleSetToMetadata = false;

    /**
     * Stores the collaborators and, when the metadata carries no content
     * location yet, derives one from the resource name if that name parses
     * as a URL.
     *
     * @param mapper         mapper deciding which elements and attributes are kept
     * @param xhtml          downstream XHTML handler; also passed to the superclass
     * @param metadata       document metadata, possibly updated with a content location
     * @param context        parse context
     * @param extractScripts whether script content is extracted
     */
    private HtmlHandler(HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata,
                        ParseContext context, boolean extractScripts) {
        super(xhtml);
        this.mapper = mapper;
        this.xhtml = xhtml;
        this.metadata = metadata;
        this.context = context;
        this.extractScripts = extractScripts;

        // An explicit base location always wins; nothing to derive in that case.
        if (metadata.get(Metadata.CONTENT_LOCATION) != null) {
            return;
        }
        String resourceName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
        if (resourceName == null) {
            return;
        }
        String candidate = resourceName.trim();
        try {
            // Constructed only to validate the format; the instance is not used.
            new URL(candidate);
        } catch (MalformedURLException e) {
            // The resource name is not a URL, so there is no default base location.
            return;
        }
        metadata.set(Metadata.CONTENT_LOCATION, candidate);
    }

    /**
     * Creates a handler that writes to the given content handler through a
     * newly created {@link XHTMLContentHandler}.
     *
     * @param mapper         mapper deciding which elements and attributes are kept
     * @param handler        content handler wrapped by the new XHTML handler
     * @param metadata       document metadata
     * @param context        parse context
     * @param extractScripts whether script content is extracted
     */
    public HtmlHandler(HtmlMapper mapper, ContentHandler handler, Metadata metadata,
                       ParseContext context, boolean extractScripts) {
        this(mapper, new XHTMLContentHandler(handler, metadata, context), metadata, context, extractScripts);
    }

    /**
     * Processes the start of an element: updates the nesting counters, records
     * metadata found before the body, forwards mapped elements to the XHTML
     * output and extracts inline resources ({@code data:} URIs in a
     * {@code src} attribute and iframe {@code srcdoc} content).
     *
     * @param uri   namespace URI (not used by this method)
     * @param local local name (not used by this method)
     * @param name  element name; compared against upper-case literals
     * @param atts  attributes of the element
     * @throws SAXException propagated from the downstream handler or from
     *                      embedded document extraction
     */
    @Override
    public void startElement(String uri, String local, String name, Attributes atts)
            throws SAXException {

        if ("HTML".equals(name) && atts.getValue("lang") != null) {
            metadata.set(Metadata.CONTENT_LANGUAGE, atts.getValue("lang"));
        }

        // Each counter starts at its trigger element and then tracks every
        // nested element, so the matching decrement in endElement brings it
        // back to zero when the trigger element closes.
        if ("SCRIPT".equals(name)) {
            scriptLevel++;
        }
        if ("TITLE".equals(name) || titleLevel > 0) {
            titleLevel++;
        }
        if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
            bodyLevel++;
        }
        if (mapper.isDiscardElement(name) || discardLevel > 0) {
            discardLevel++;
        }

        // Elements seen before the body (and not being discarded).
        if (bodyLevel == 0 && discardLevel == 0) {
            if ("META".equals(name) && atts.getValue("content") != null) {
                // TIKA-478: For cases where we have either a name or
                // "http-equiv", assume that XHTMLContentHandler will emit
                // these in the <head>, thus passing them through safely.
                if (atts.getValue("http-equiv") != null) {
                    addHtmlMetadata(atts.getValue("http-equiv"), atts.getValue("content"));
                } else if (atts.getValue("name") != null) {
                    // Record the meta tag in the metadata
                    addHtmlMetadata(atts.getValue("name"), atts.getValue("content"));
                } else if (atts.getValue("property") != null) {
                    // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
                    metadata.add(HTML.PREFIX_HTML_META + atts.getValue("property"), atts.getValue("content"));
                }
            } else if ("BASE".equals(name) && atts.getValue("href") != null) {
                // The base element is emitted first (its href is resolved against
                // the previous location), then the location is updated.
                startElementWithSafeAttributes("base", atts);
                xhtml.endElement("base");
                metadata.set(Metadata.CONTENT_LOCATION, resolve(atts.getValue("href")));
            } else if ("LINK".equals(name)) {
                startElementWithSafeAttributes("link", atts);
                xhtml.endElement("link");
            } else if ("SCRIPT".equals(name)) {
                // The attributes are read again in endElement. SAX only
                // guarantees the Attributes object for the duration of this
                // callback, so keep a private snapshot instead of the reference.
                // NOTE(review): a nested SCRIPT start would overwrite this
                // snapshot; confirm whether only the outermost one is wanted.
                scriptAtts = new AttributesImpl(atts);
            }
        }

        // Elements inside the body are emitted only if the mapper knows a safe name.
        if (bodyLevel > 0 && discardLevel == 0) {
            String safe = mapper.mapSafeElement(name);
            if (safe != null) {
                startElementWithSafeAttributes(safe, atts);
            }
        }

        // Any element start drops the title text collected so far.
        title.setLength(0);
        String value = atts.getValue("src");
        if (value != null && value.startsWith("data:")) {
            //don't extract data if we're in a script
            //and the user doesn't want to extract scripts
            if (scriptLevel == 0 || extractScripts) {
                handleDataURIScheme(value);
            }
        }
        if ("IFRAME".equals(name)) {
            String srcDoc = atts.getValue("srcdoc");
            if (!StringUtils.isBlank(srcDoc)) {
                handleSrcDoc(srcDoc);
            }
        }
    }

    /**
     * Records one meta tag from the document head in the Tika metadata.
     * <p>
     * Blank names or values are ignored. {@code ICBM} and the content type
     * get special treatment, names listed in {@code META_HEADER_MAPPINGS} are
     * stored under their mapped property, and everything else is stored under
     * the generic HTML meta prefix.
     *
     * @param name  meta name as written in the attribute (not case-normalized)
     * @param value meta content
     */
    private void addHtmlMetadata(String name, String value) {
        // Unlike element names, "name" comes straight from an attribute value
        // and has not been upper-cased, hence the case-insensitive comparisons.
        if (StringUtils.isBlank(name) || StringUtils.isBlank(value)) {
            return;
        }

        if (name.equalsIgnoreCase("ICBM")) {
            Matcher coordinates = ICBM.matcher(value);
            if (!coordinates.matches()) {
                // Not a recognizable coordinate pair: keep the raw value.
                metadata.set("ICBM", value);
                return;
            }
            String latitude = coordinates.group(1);
            String longitude = coordinates.group(2);
            metadata.set("ICBM", latitude + ", " + longitude);
            metadata.set(Metadata.LATITUDE, latitude);
            metadata.set(Metadata.LONGITUDE, longitude);
            return;
        }

        if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
            // The declared type is only a hint; Metadata.CONTENT_TYPE itself
            // must not be overwritten here.
            MediaType declared = MediaType.parse(value);
            String hint = declared != null ? declared.toString() : value;
            metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, hint);
            return;
        }

        Property mapped = META_HEADER_MAPPINGS.get(name.toLowerCase(Locale.US));
        if (mapped == null) {
            metadata.add(HTML.PREFIX_HTML_META + name, value);
            return;
        }

        if (mapped.equals(TikaCoreProperties.TITLE) && isTitleSetToMetadata) {
            // The title element has already supplied the title and takes
            // precedence; keep the meta title under the HTML meta prefix.
            metadata.add(HTML.PREFIX_HTML_META + TikaCoreProperties.TITLE.getName(), value);
        } else if (mapped.isMultiValuePermitted()) {
            metadata.add(mapped, value);
        } else {
            metadata.set(mapped, value);
        }
    }

    /**
     * Starts an element on the XHTML output, keeping only the attributes the
     * mapper accepts for that element.
     * <p>
     * Kept attributes are renamed to their mapped name. URL-valued attributes
     * are resolved against the base location, a {@code data:} URI in
     * {@code src} is shortened to just {@code "data:"}, and for
     * {@code object} elements the {@code codebase}, {@code data} and
     * {@code classid} values are resolved relative to the code base. An
     * {@code img} element without {@code alt} gets an empty one.
     *
     * @param name output element name
     * @param atts attributes as received from the parser
     * @throws SAXException propagated from the XHTML handler
     */
    private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
        final int total = atts.getLength();
        if (total == 0) {
            xhtml.startElement(name);
            return;
        }

        // For <object>, work out the base that data/classid are relative to:
        // the resolved codebase attribute, or else the document location.
        final boolean objectElement = name.equals("object");
        String objectBase = null;
        if (objectElement) {
            String declaredBase = atts.getValue("", "codebase");
            objectBase = declaredBase != null
                    ? resolve(declaredBase)
                    : metadata.get(Metadata.CONTENT_LOCATION);
        }

        AttributesImpl safeAtts = new AttributesImpl();
        for (int i = 0; i < total; i++) {
            String mappedName = mapper.mapSafeAttribute(name, atts.getLocalName(i));
            if (mappedName == null) {
                // Not a safe attribute for this element: leave it out.
                continue;
            }

            String value = atts.getValue(i);
            // Resolving relative links here should eventually move into the
            // HtmlMapper code.
            if (URI_ATTRIBUTES.contains(mappedName)) {
                // A src="data:..." payload has been handled as an embedded
                // file, so only the scheme marker is kept in the output.
                if (mappedName.equals("src") && value.startsWith("data:")) {
                    value = "data:";
                }
                value = resolve(value);
            } else if (objectElement && "codebase".equals(mappedName)) {
                value = objectBase;
            } else if (objectElement
                    && ("data".equals(mappedName) || "classid".equals(mappedName))) {
                value = resolve(objectBase, value);
            }

            // Only the local name and the value may differ from the input.
            safeAtts.addAttribute(atts.getURI(i), mappedName, atts.getQName(i),
                    atts.getType(i), value);
        }

        if ("img".equals(name) && safeAtts.getValue("", "alt") == null) {
            safeAtts.addAttribute("", "alt", "alt", "CDATA", "");
        }

        xhtml.startElement(name, safeAtts);
    }

    /**
     * Processes the end of an element: finishes a script element, closes the
     * mapped element (or emits a newline for unmapped block-level elements),
     * stores the title once the TITLE element closes, and decrements the
     * nesting counters.
     *
     * @param uri   namespace URI (not used by this method)
     * @param local local name (not used by this method)
     * @param name  element name; compared against upper-case literals
     * @throws SAXException propagated from the downstream handler or from
     *                      embedded document extraction
     */
    @Override
    public void endElement(String uri, String local, String name) throws SAXException {
        if ("SCRIPT".equals(name)) {
            scriptLevel--;
            if (scriptLevel == 0) {
                try {
                    if (scriptAtts.getLength() > 0) {
                        startElementWithSafeAttributes("script", scriptAtts);
                        xhtml.endElement("script");
                    }
                    if (extractScripts) {
                        // writeScript() reads the "src" attribute from scriptAtts,
                        // so the attributes must still be in place here.
                        writeScript();
                    }
                } finally {
                    // Reset only after the script has been written; resetting
                    // earlier would hide the script's attributes from writeScript().
                    scriptAtts = EMPTY_ATTS;
                }
            }
        }

        if (bodyLevel > 0 && discardLevel == 0) {
            String safe = mapper.mapSafeElement(name);
            if (safe != null) {
                xhtml.endElement(safe);
            } else if (XHTMLContentHandler.ENDLINE.contains(name.toLowerCase(Locale.ENGLISH))) {
                // TIKA-343: Replace closing block tags (and <br/>) with a
                // newline unless the HtmlMapper above has already mapped
                // them to something else
                xhtml.newline();
            }
        }

        if (titleLevel > 0) {
            titleLevel--;
            // Only the first completed title element sets the metadata title.
            if (titleLevel == 0 && !isTitleSetToMetadata) {
                metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
                isTitleSetToMetadata = true;
            }
        }
        if (bodyLevel > 0) {
            bodyLevel--;
        }
        if (discardLevel > 0) {
            discardLevel--;
        }
    }
    /**
     * Hands the content of an iframe {@code srcdoc} attribute to the embedded
     * document extractor as an inline HTML document.
     *
     * @param string the srcdoc markup; encoded as UTF-8 for parsing
     * @throws SAXException propagated from the embedded document extractor
     */
    private void handleSrcDoc(String string) throws SAXException {
        Metadata srcDocMetadata = Metadata.newInstance(context);
        srcDocMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
        srcDocMetadata.set(Metadata.CONTENT_TYPE, "text/html");
        srcDocMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, "text/html");
        //TODO add metadata about iframe content?

        EmbeddedDocumentExtractor extractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        if (!extractor.shouldParseEmbedded(srcDocMetadata)) {
            return;
        }

        byte[] markup = string.getBytes(StandardCharsets.UTF_8);
        try (TikaInputStream tis = TikaInputStream.get(markup)) {
            extractor.parseEmbedded(tis, xhtml, srcDocMetadata, context, true);
        } catch (IOException e) {
            // Record the failure on the containing document's metadata.
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        }
    }

    /**
     * Parses a {@code data:} URI and hands its payload to the embedded
     * document extractor as an inline resource. A URI that cannot be parsed
     * is ignored.
     *
     * @param string the complete data URI
     * @throws SAXException propagated from the embedded document extractor
     */
    private void handleDataURIScheme(String string) throws SAXException {
        DataURIScheme parsed;
        try {
            parsed = dataURISchemeUtil.parse(string);
        } catch (DataURISchemeParseException e) {
            // Malformed data URI: nothing to extract.
            return;
        }

        //do anything with attrs?
        Metadata inlineMetadata = Metadata.newInstance(context);
        inlineMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
        // The media type is set only when the URI yields one.
        if (parsed.getMediaType() != null) {
            inlineMetadata.set(Metadata.CONTENT_TYPE, parsed.getMediaType().toString());
        }

        EmbeddedDocumentExtractor extractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        if (!extractor.shouldParseEmbedded(inlineMetadata)) {
            return;
        }

        try (TikaInputStream tis = TikaInputStream.get(parsed.getInputStream())) {
            extractor.parseEmbedded(tis, xhtml, inlineMetadata, context, true);
        } catch (IOException e) {
            // Record the failure on the containing document's metadata.
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        }
    }

    /**
     * Hands the buffered script text to the embedded document extractor as a
     * macro, after first extracting any {@code data:} URIs found inside the
     * script text as inline resources.
     * <p>
     * The script buffer is emptied on every call, including when the text is
     * blank or extraction fails, so text from one script element cannot leak
     * into the next one. I/O failures are recorded on the document metadata.
     *
     * @throws SAXException propagated from the embedded document extractor
     */
    private void writeScript() throws SAXException {
        // Take the text and clear the buffer up front so that every exit path
        // (blank script, exception, normal completion) leaves it empty.
        String scriptText = script.toString();
        script.setLength(0);

        //don't write an attached macro if there is no content
        //we may want to revisit this behavior
        if (scriptText.isBlank()) {
            return;
        }
        //do anything with attrs?
        Metadata m = Metadata.newInstance(context);
        m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
        String src = scriptAtts.getValue("src");
        if (src != null) {
            m.set(HTML.SCRIPT_SOURCE, src);
        }

        EmbeddedDocumentExtractor embeddedDocumentExtractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        //try to scrape dataURISchemes from javascript
        List<DataURIScheme> dataURISchemes = dataURISchemeUtil.extract(scriptText);
        for (DataURIScheme dataURIScheme : dataURISchemes) {
            Metadata dataUriMetadata = Metadata.newInstance(context);
            dataUriMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                    TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
            // Same guard as handleDataURIScheme: set the content type only
            // when the data URI yields a media type.
            if (dataURIScheme.getMediaType() != null) {
                dataUriMetadata.set(Metadata.CONTENT_TYPE,
                        dataURIScheme.getMediaType().toString());
            }
            if (embeddedDocumentExtractor.shouldParseEmbedded(dataUriMetadata)) {
                try (TikaInputStream tis = TikaInputStream.get(dataURIScheme.getInputStream())) {
                    embeddedDocumentExtractor
                            .parseEmbedded(tis, xhtml, dataUriMetadata, context, true);
                } catch (IOException e) {
                    // Keep going with the remaining resources, but leave a trace.
                    EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
                }
            }
        }

        try (TikaInputStream tis =
                     TikaInputStream.get(scriptText.getBytes(StandardCharsets.UTF_8))) {
            embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, context, true);
        } catch (IOException e) {
            // Not expected for an in-memory stream; record it rather than hide it.
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        }
    }

    /**
     * Routes character data to the script buffer, the title buffer and/or the
     * downstream handler, depending on where in the document it occurs.
     *
     * @param ch     character array
     * @param start  offset of the first character
     * @param length number of characters
     * @throws SAXException propagated from the superclass
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        final boolean insideBody = bodyLevel > 0;

        // Script text is buffered only when script extraction is enabled.
        if (extractScripts && scriptLevel > 0) {
            script.append(ch, start, length);
        }
        // Title text is collected only for a title that is outside the body.
        if (!insideBody && titleLevel > 0) {
            title.append(ch, start, length);
        }
        // Body text is passed on unless it is inside a discarded element.
        if (insideBody && discardLevel == 0) {
            super.characters(ch, start, length);
        }
    }

    /**
     * Passes ignorable whitespace on to the superclass only when it occurs
     * inside the body and outside any discarded element.
     *
     * @param ch     character array
     * @param start  offset of the first character
     * @param length number of characters
     * @throws SAXException propagated from the superclass
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
        if (bodyLevel <= 0 || discardLevel != 0) {
            return;
        }
        super.ignorableWhitespace(ch, start, length);
    }

    /**
     * Resolves a URL against the content location currently stored in the
     * metadata (which may be absent).
     *
     * @param url the URL to resolve; must not be null
     * @return the result of resolving {@code url} against that location
     */
    private String resolve(String url) {
        return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
    }

    /**
     * Resolves a possibly relative URL against a base URL.
     * <p>
     * The trimmed URL is returned unchanged when there is no base, when the
     * URL starts with one of a few non-hierarchical or pseudo URI schemes, or
     * when either URL cannot be parsed.
     *
     * @param base base URL, or null if none is known
     * @param url  URL to resolve; must not be null
     * @return the resolved URL in external form, or the trimmed input
     */
    private String resolve(String base, String url) {
        final String target = url.trim();

        if (base == null || hasNonHierarchicalPrefix(target.toLowerCase(Locale.ENGLISH))) {
            return target;
        }

        try {
            URL baseLocation = new URL(base.trim());
            String basePath = baseLocation.getPath();

            // Special case: the relative URL is only a query string (like
            // "?pid=1") and the base path does not end with '/'. The URL class
            // would drop the last path segment, so the query is appended to
            // the full base path by hand instead.
            boolean bareQuery =
                    target.startsWith("?") && !basePath.isEmpty() && !basePath.endsWith("/");
            URL resolved = bareQuery
                    ? new URL(baseLocation.getProtocol(), baseLocation.getHost(),
                            baseLocation.getPort(), baseLocation.getPath() + target)
                    : new URL(baseLocation, target);
            return resolved.toExternalForm();
        } catch (MalformedURLException e) {
            // Unknown or broken format; hand the URL back as received.
            return target;
        }
    }

    /**
     * Tells whether a lower-cased URL starts with a scheme that is never
     * resolved against a base URL.
     */
    private static boolean hasNonHierarchicalPrefix(String lowerCaseUrl) {
        String[] prefixes = {"urn:", "mailto:", "tel:", "data:", "javascript:", "about:"};
        for (String prefix : prefixes) {
            if (lowerCaseUrl.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }

}