RecursiveParserWrapperHandler.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.sax;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.utils.ParserUtils;
/**
* This is the default implementation of {@link AbstractRecursiveParserWrapperHandler}.
* See its documentation for more details.
* <p>
* This caches a metadata object for each embedded file and for the container file.
* It places the extracted content in the metadata object, with this key:
* {@link TikaCoreProperties#TIKA_CONTENT}
* If memory is a concern, subclass AbstractRecursiveParserWrapperHandler to handle each
* embedded document.
* <p>
* <b>NOTE: This handler must only be used with the {@link
* org.apache.tika.parser.RecursiveParserWrapper}</b>
* </p>
*/
public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrapperHandler {

    /**
     * Metadata collected so far. Embedded documents are appended as they finish;
     * the container document is inserted at index 0 by
     * {@link #endDocument(ContentHandler, Metadata)}.
     */
    protected final List<Metadata> metadataList = new LinkedList<>();

    /**
     * Builds a handler for recursive parsing.
     * <p>
     * Limits on embedded resources are not set here; they are configured through
     * {@link org.apache.tika.config.EmbeddedLimits} in the ParseContext.
     *
     * @param contentHandlerFactory factory used to create content handlers
     */
    public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) {
        super(contentHandlerFactory);
    }

    /**
     * Invoked before an embedded document is parsed. Simply delegates to the superclass.
     *
     * @param contentHandler local content handler that will be used on the embedded document
     * @param metadata       metadata that will be used for the embedded document
     * @throws SAXException propagated from the superclass implementation
     */
    @Override
    public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metadata)
            throws SAXException {
        super.startEmbeddedDocument(contentHandler, metadata);
    }

    /**
     * Invoked after an embedded document has been parsed. After delegating to the
     * superclass, the handler's content is added to the metadata (see
     * {@link #addContent(ContentHandler, Metadata)}) and, if the metadata is not empty,
     * a clone of it is appended to the metadata list.
     *
     * @param contentHandler local content handler that was used on the embedded document
     * @param metadata       metadata of the embedded document
     * @throws SAXException propagated from the superclass implementation
     */
    @Override
    public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata)
            throws SAXException {
        super.endEmbeddedDocument(contentHandler, metadata);
        addContent(contentHandler, metadata);
        if (metadata.size() <= 0) {
            // nothing worth recording for this embedded document
            return;
        }
        metadataList.add(ParserUtils.cloneMetadata(metadata));
    }

    /**
     * Invoked once the main document has been parsed. After delegating to the superclass,
     * the handler's content is added to the metadata; a clone of non-empty metadata is
     * placed at the front of the metadata list, and finally the
     * {@link TikaCoreProperties#FINAL_EMBEDDED_RESOURCE_PATH} values are written.
     *
     * @param contentHandler content handler that was used on the main document
     * @param metadata       metadata of the main document
     * @throws SAXException propagated from the superclass implementation
     */
    @Override
    public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
        super.endDocument(contentHandler, metadata);
        addContent(contentHandler, metadata);
        boolean hasEntries = metadata.size() > 0;
        if (hasEntries) {
            // the container always goes first, ahead of any embedded documents
            metadataList.add(0, ParserUtils.cloneMetadata(metadata));
        }
        writeFinalEmbeddedPaths();
    }

    /**
     * Rewrites the "final embedded resource path" of every collected metadata object
     * that carries an id path.
     */
    private void writeFinalEmbeddedPaths() {
        // For some file types the file's "name" is not known until after its attachments
        // have been parsed, so the path is regenerated from the id path once all
        // metadata objects have been collected.
        Map<String, String> namesById = buildIdToNameIndex();
        for (Metadata entry : metadataList) {
            String rawIdPath = entry.get(TikaCoreProperties.EMBEDDED_ID_PATH);
            if (rawIdPath != null) {
                entry.set(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH,
                        renderNamePath(rawIdPath, namesById));
            }
        }
    }

    /**
     * Maps each embedded id found in the metadata list to the resource name reported by
     * {@link RecursiveParserWrapper#getResourceName}. Entries without an embedded id are
     * skipped; a single counter is shared across all lookups.
     */
    private Map<String, String> buildIdToNameIndex() {
        Map<String, String> namesById = new HashMap<>();
        AtomicInteger unknownCount = new AtomicInteger(0);
        for (Metadata entry : metadataList) {
            String embeddedId = entry.get(TikaCoreProperties.EMBEDDED_ID);
            if (embeddedId != null) {
                String resourceName = RecursiveParserWrapper.getResourceName(entry, unknownCount);
                namesById.put(embeddedId, resourceName);
            }
        }
        return namesById;
    }

    /**
     * Turns a "/"-separated id path into a "/"-prefixed path of names by looking up each
     * id in the supplied map. A single leading "/" on the id path is ignored.
     */
    private static String renderNamePath(String idPath, Map<String, String> namesById) {
        String relative = idPath.startsWith("/") ? idPath.substring(1) : idPath;
        StringBuilder namePath = new StringBuilder();
        for (String segment : relative.split("/")) {
            namePath.append("/");
            // NOTE(review): an id with no collected name is rendered as the text "null"
            namePath.append(namesById.get(segment));
        }
        return namePath.toString();
    }

    /**
     * @return the list of Metadata objects, one for the main document and one for each
     * embedded document
     */
    public List<Metadata> getMetadataList() {
        return metadataList;
    }

    /**
     * Copies the handler's string form into the metadata under
     * {@link TikaCoreProperties#TIKA_CONTENT}, together with the handler's simple class
     * name and the factory's handler type name. Nothing is written for a plain
     * {@link DefaultHandler} or when the content is null or blank.
     *
     * @param handler  handler whose {@code toString()} supplies the content
     * @param metadata metadata object that receives the content
     */
    void addContent(ContentHandler handler, Metadata metadata) {
        Class<? extends ContentHandler> handlerType = handler.getClass();
        if (handlerType.equals(DefaultHandler.class)) {
            // Testing for empty content is not enough here: DefaultHandler's toString()
            // returns e.g. "org.xml.sax.helpers.DefaultHandler@6c8b1edd"
            return;
        }
        String text = handler.toString();
        if (text == null || text.isBlank()) {
            return;
        }
        metadata.add(TikaCoreProperties.TIKA_CONTENT, text);
        metadata.add(TikaCoreProperties.TIKA_CONTENT_HANDLER, handlerType.getSimpleName());
        metadata.set(TikaCoreProperties.TIKA_CONTENT_HANDLER_TYPE,
                getContentHandlerFactory().handlerTypeName());
    }
}