RecursiveParserWrapperHandler.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;

import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.utils.ParserUtils;

/**
 * This is the default implementation of {@link AbstractRecursiveParserWrapperHandler}.
 * See its documentation for more details.
 * <p>
 * This caches a metadata object for each embedded file and for the container file.
 * It places the extracted content in the metadata object, with this key:
 * {@link TikaCoreProperties#TIKA_CONTENT}
 * If memory is a concern, subclass AbstractRecursiveParserWrapperHandler to handle each
 * embedded document.
 * <p>
 * <b>NOTE: This handler must only be used with the {@link
 * org.apache.tika.parser.RecursiveParserWrapper}</b>
 * </p>
 */
public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrapperHandler {

    /**
     * Cloned metadata collected so far. Embedded documents are appended as they finish;
     * the main document is inserted at index 0 when {@link #endDocument} runs.
     */
    protected final List<Metadata> metadataList = new LinkedList<>();

    /**
     * Builds a handler that collects one metadata object per parsed document.
     * <p>
     * Limits on embedded resources are not set here; they are configured through
     * {@link org.apache.tika.config.EmbeddedLimits} in the ParseContext.
     *
     * @param contentHandlerFactory factory used to create content handlers
     */
    public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) {
        super(contentHandlerFactory);
    }

    /**
     * Invoked before an embedded document is parsed. This implementation only
     * delegates to the superclass.
     *
     * @param contentHandler local content handler to use on the embedded document
     * @param metadata       metadata to use for the embedded document
     * @throws SAXException if the superclass callback reports one
     */
    @Override
    public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metadata)
            throws SAXException {
        super.startEmbeddedDocument(contentHandler, metadata);
    }

    /**
     * Invoked after an embedded document has been parsed. The extracted content is
     * stored in the metadata and, unless the metadata is empty, a clone of it is
     * appended to the collected list.
     *
     * @param contentHandler local content handler that was used on the embedded document
     * @param metadata       metadata from the embedded document
     * @throws SAXException if the superclass callback reports one
     */
    @Override
    public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata)
            throws SAXException {
        super.endEmbeddedDocument(contentHandler, metadata);
        addContent(contentHandler, metadata);

        if (metadata.size() <= 0) {
            // nothing worth recording for this embedded document
            return;
        }
        metadataList.add(ParserUtils.cloneMetadata(metadata));
    }

    /**
     * Invoked once the main document has been parsed. The extracted content is stored
     * in the metadata, a clone of non-empty metadata is placed at the head of the
     * collected list, and the final embedded resource paths are then regenerated.
     *
     * @param contentHandler content handler used on the main document
     * @param metadata       metadata from the main document
     * @throws SAXException if the superclass callback reports one
     */
    @Override
    public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
        super.endDocument(contentHandler, metadata);
        addContent(contentHandler, metadata);
        final boolean hasMetadata = metadata.size() > 0;
        if (hasMetadata) {
            // the main document always goes first, ahead of the embedded documents
            metadataList.add(0, ParserUtils.cloneMetadata(metadata));
        }
        writeFinalEmbeddedPaths();
    }

    /**
     * Rewrites {@link TikaCoreProperties#FINAL_EMBEDDED_RESOURCE_PATH} on every collected
     * metadata object that carries an id path.
     * <p>
     * Some file types do not know a file's "name" until its attachments have been parsed,
     * so the name-based path is rebuilt here from the id path once everything is collected.
     */
    private void writeFinalEmbeddedPaths() {
        // Pass 1: map each embedded id to the resource name resolved for that document.
        final Map<String, String> namesById = new HashMap<>();
        final AtomicInteger unknownCounter = new AtomicInteger(0);
        for (Metadata entry : metadataList) {
            final String embeddedId = entry.get(TikaCoreProperties.EMBEDDED_ID);
            if (embeddedId != null) {
                final String resourceName =
                        RecursiveParserWrapper.getResourceName(entry, unknownCounter);
                namesById.put(embeddedId, resourceName);
            }
        }

        // Pass 2: translate every id path into the corresponding path of names.
        for (Metadata entry : metadataList) {
            final String rawIdPath = entry.get(TikaCoreProperties.EMBEDDED_ID_PATH);
            if (rawIdPath == null) {
                continue;
            }
            // drop a single leading slash so that split() yields no empty first segment
            final String relativeIdPath =
                    rawIdPath.startsWith("/") ? rawIdPath.substring(1) : rawIdPath;
            final StringBuilder namePath = new StringBuilder();
            for (String segmentId : relativeIdPath.split("/")) {
                // an id without a recorded name is appended as the literal text "null"
                namePath.append("/").append(namesById.get(segmentId));
            }
            entry.set(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH, namePath.toString());
        }
    }

    /**
     * Returns the collected metadata. This is the handler's internal list, not a copy.
     *
     * @return a list of Metadata objects, one for the main document and one for each embedded
     * document
     */
    public List<Metadata> getMetadataList() {
        return metadataList;
    }

    /**
     * Stores the handler's extracted text, together with the handler's simple class name
     * and the factory's handler type name, in the given metadata. Nothing is written when
     * the handler is a plain {@link DefaultHandler} or when its text is null or blank.
     *
     * @param handler  content handler whose {@code toString()} supplies the extracted text
     * @param metadata metadata object that receives the content entries
     */
    void addContent(ContentHandler handler, Metadata metadata) {
        final Class<?> handlerClass = handler.getClass();
        if (DefaultHandler.class.equals(handlerClass)) {
            // A blank-content test alone is not enough here: DefaultHandler's toString()
            // returns e.g. "org.xml.sax.helpers.DefaultHandler@6c8b1edd"
            return;
        }

        final String extractedText = handler.toString();
        if (extractedText == null || extractedText.isBlank()) {
            return;
        }

        metadata.add(TikaCoreProperties.TIKA_CONTENT, extractedText);
        metadata.add(TikaCoreProperties.TIKA_CONTENT_HANDLER, handlerClass.getSimpleName());
        metadata.set(TikaCoreProperties.TIKA_CONTENT_HANDLER_TYPE,
                getContentHandlerFactory().handlerTypeName());
    }
}