// AbstractOOXMLExtractor.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.ooxml;

import static org.apache.tika.sax.XHTMLContentHandler.XHTML;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.openxml4j.opc.internal.FileHelper;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.SummaryExtractor;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;

/**
 * Base class for all Tika OOXML extractors.
 * <p>
 * Tika extractors decorate POI extractors so that the parsed content of
 * documents is returned as a sequence of XHTML SAX events. Subclasses must
 * implement {@link #buildXHTML(XHTMLContentHandler)}, which populates the
 * {@link XHTMLContentHandler} it receives, and {@link #getMainDocumentParts()},
 * which lists the parts that are searched for embedded resources.
 */
public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {


    // Relationship-type URIs used by this class when classifying the relationships of a part.
    static final String RELATION_AUDIO =
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio";
    static final String RELATION_MEDIA =
            "http://schemas.microsoft.com/office/2007/relationships/media";
    static final String RELATION_VIDEO =
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/video";
    static final String RELATION_DIAGRAM_DATA =
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramData";

    static final String RELATION_ALTERNATE_FORMAT_CHUNK =
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/aFChunk";

    /**
     * Relationship types whose targets are added (relationship id to target file name) to
     * the map built by {@link #loadLinkedRelationships(PackagePart, boolean, Metadata)}.
     */
    protected static final String[] EMBEDDED_RELATIONSHIPS =
            new String[]{RELATION_AUDIO, PackageRelationshipTypes.IMAGE_PART,
                    POIXMLDocument.PACK_OBJECT_REL_TYPE, PackageRelationshipTypes.CORE_DOCUMENT,
                    RELATION_DIAGRAM_DATA};
    // Content type that an OLE-object relationship target must have for it to be
    // opened as an OLE2 container (see handleEmbeddedPart).
    private static final String TYPE_OLE_OBJECT =
            "application/vnd.openxmlformats-officedocument.oleObject";


    // Receives every embedded resource (attachments, images, thumbnails, macros)
    // found by this extractor; obtained from the parse context in the constructor.
    private final EmbeddedDocumentExtractor embeddedExtractor;
    // Parse context passed to the constructor; forwarded to all embedded parsing calls.
    private final ParseContext context;
    // Looked up from the parse context in the constructor.
    protected OfficeParserConfig config;
    // The POI extractor that this class decorates.
    protected POIXMLTextExtractor extractor;

    /**
     * Creates an extractor that decorates the given POI text extractor.
     *
     * @param context   parse context; also the source of the embedded document
     *                  extractor and of the {@link OfficeParserConfig}
     * @param extractor the POI extractor to decorate
     */
    public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) {
        this.extractor = extractor;
        this.context = context;
        this.embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);

        // OOXMLParser's call to configure() has already placed the config in the
        // context, so this lookup can be relied on to be non-null.
        this.config = context.get(OfficeParserConfig.class);
    }

    /**
     * Returns the document held by the decorated POI extractor.
     *
     * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument()
     */
    public POIXMLDocument getDocument() {
        return POIXMLDocument.class.cast(extractor.getDocument());
    }

    /**
     * Creates a new metadata extractor wrapping the decorated POI extractor.
     *
     * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor()
     */
    public MetadataExtractor getMetadataExtractor() {
        final MetadataExtractor metadataExtractor = new MetadataExtractor(extractor);
        return metadataExtractor;
    }

    /**
     * @return the parse context this extractor was constructed with
     */
    ParseContext getParseContext() {
        return this.context;
    }
    /**
     * Writes the whole document as XHTML SAX events: the main content produced by
     * {@link #buildXHTML(XHTMLContentHandler)}, then the embedded parts, then the
     * thumbnail(s).
     *
     * @see
     * org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(ContentHandler, Metadata,
     * ParseContext)
     */
    public void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context)
            throws SAXException, XmlException, IOException, TikaException {
        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
        xhtml.startDocument();

        // main document content
        buildXHTML(xhtml);

        // embedded parts; the map is fetched only after the main content has been built
        final Map<String, EmbeddedPartMetadata> embeddedPartMetadata =
                getEmbeddedPartMetadataMap();
        handleEmbeddedParts(xhtml, metadata, embeddedPartMetadata);

        // thumbnail
        handleThumbnail(xhtml, metadata);

        xhtml.endDocument();
    }

    /**
     * Returns extra metadata for embedded parts, keyed by relationship id (the map is
     * queried with {@code rel.getId()} while embedded parts are handled).
     * This base implementation has nothing to contribute.
     *
     * @return an empty map
     */
    protected Map<String, EmbeddedPartMetadata> getEmbeddedPartMetadataMap() {
        return Collections.<String, EmbeddedPartMetadata>emptyMap();
    }

    /**
     * Reduces a '/'-separated path to its last segment with the final extension removed,
     * e.g. {@code /ppt/slides/slide1.xml} becomes {@code slide1}.
     *
     * @param desc the path to shorten; must not be null
     * @return the text after the last '/' (or all of {@code desc} if there is none),
     * truncated before its last '.' (if there is one)
     */
    protected String getJustFileName(String desc) {
        final int lastSlash = desc.lastIndexOf('/');
        final String lastSegment = (lastSlash == -1) ? desc : desc.substring(lastSlash + 1);

        final int lastDot = lastSegment.lastIndexOf('.');
        return (lastDot == -1) ? lastSegment : lastSegment.substring(0, lastDot);
    }

    /**
     * Emits a placeholder {@code div} for every thumbnail part of the package and passes
     * the thumbnail bytes to the embedded document extractor.
     * <p>
     * Apart from {@link SecurityException} and whatever
     * {@code WriteLimitReachedException.throwIfWriteLimitReached} rethrows, failures are
     * not propagated; their stack trace is added to {@code metadata}.
     *
     * @param handler  receives the placeholder element and the embedded content
     * @param metadata metadata of the containing document; receives recorded failures
     */
    private void handleThumbnail(ContentHandler handler, Metadata metadata) throws SAXException {
        try {
            final OPCPackage pkg = extractor.getPackage();
            for (PackageRelationship thumbnailRel : pkg
                    .getRelationshipsByType(PackageRelationshipTypes.THUMBNAIL)) {
                final PackagePart thumbnailPart = pkg.getPart(thumbnailRel);
                if (thumbnailPart == null) {
                    continue;
                }
                try (InputStream thumbnailStream = thumbnailPart.getInputStream()) {
                    final String partName = thumbnailPart.getPartName().getName();

                    final Metadata thumbMeta = Metadata.newInstance(context);
                    thumbMeta.set(TikaCoreProperties.INTERNAL_PATH, partName);
                    thumbMeta.set(TikaCoreProperties.RESOURCE_NAME_KEY,
                            FilenameUtils.getName(partName));

                    // empty placeholder element marking where the thumbnail belongs
                    final AttributesImpl divAttrs = new AttributesImpl();
                    divAttrs.addAttribute(XHTML, "class", "class", "CDATA", "embedded");
                    divAttrs.addAttribute(XHTML, "id", "id", "CDATA", partName);
                    handler.startElement(XHTML, "div", "div", divAttrs);
                    handler.endElement(XHTML, "div", "div");

                    thumbMeta.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, partName);
                    thumbMeta.set(Metadata.CONTENT_TYPE, thumbnailPart.getContentType());
                    thumbMeta.set(TikaCoreProperties.TITLE, partName);
                    thumbMeta.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                            TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.name());

                    if (!embeddedExtractor.shouldParseEmbedded(thumbMeta)) {
                        continue;
                    }
                    try (TikaInputStream tis = TikaInputStream.get(thumbnailStream)) {
                        embeddedExtractor.parseEmbedded(tis,
                                new EmbeddedContentHandler(handler), thumbMeta, context, false);
                    }
                }
            }
        } catch (SecurityException securityException) {
            throw securityException;
        } catch (Exception failure) {
            WriteLimitReachedException.throwIfWriteLimitReached(failure);
            // anything else is recorded on the parent document and swallowed
            metadata.add(TikaCoreProperties.EMBEDDED_EXCEPTION,
                    ExceptionUtils.getStackTrace(failure));
        }
    }

    /**
     * Walks every relationship of every main document part and hands it to
     * {@code handleEmbeddedPart}.
     * <p>
     * A failure on a single relationship is recorded on {@code metadata} and does not stop
     * the walk; only {@link SAXException} and {@link SecurityException} are rethrown.
     *
     * @param xhtml                   handler receiving the embedded content
     * @param metadata                metadata of the containing document
     * @param embeddedPartMetadataMap extra per-part metadata, passed through unchanged
     * @throws TikaException if the relationships of a part cannot be read
     */
    private void handleEmbeddedParts(XHTMLContentHandler xhtml, Metadata metadata,
                                     Map<String, EmbeddedPartMetadata> embeddedPartMetadataMap)
            throws TikaException, IOException, SAXException {
        // Several relationships can point at the same underlying media item;
        // remembering the targets already processed ensures each item is handled once.
        final Set<String> seenTargets = new HashSet<>();
        try {
            for (PackagePart mainPart : getMainDocumentParts()) {
                // parts can go missing; silently ignore --  TIKA-2134
                if (mainPart != null) {
                    for (PackageRelationship relationship : mainPart.getRelationships()) {
                        try {
                            handleEmbeddedPart(mainPart, relationship, xhtml, metadata,
                                    embeddedPartMetadataMap, seenTargets);
                        } catch (SAXException | SecurityException fatal) {
                            throw fatal;
                        } catch (Exception recoverable) {
                            EmbeddedDocumentUtil
                                    .recordEmbeddedStreamException(recoverable, metadata);
                        }
                    }
                }
            }
        } catch (InvalidFormatException invalidFormat) {
            throw new TikaException("Broken OOXML file", invalidFormat);
        }
    }

    /**
     * Processes a single relationship of a main document part.
     * <p>
     * Non-internal targets are only reported as external-reference anchors. Internal
     * targets are dispatched on the relationship type (OLE object, image, media/package,
     * VBA macros, alternate format chunk); any other type is ignored. A target that was
     * dispatched is remembered in {@code handledTarget} so that further relationships to
     * the same target are skipped.
     *
     * @param source                  the part that owns the relationship
     * @param rel                     the relationship to process
     * @param xhtml                   handler receiving the output
     * @param parentMetadata          metadata of the containing document
     * @param embeddedPartMetadataMap extra metadata, looked up by relationship id
     * @param handledTarget           target URIs (as strings) that were already processed;
     *                                updated by this method
     */
    private void handleEmbeddedPart(PackagePart source, PackageRelationship rel,
                                    XHTMLContentHandler xhtml, Metadata parentMetadata,
                                    Map<String, EmbeddedPartMetadata> embeddedPartMetadataMap,
                                    Set<String> handledTarget)
            throws IOException, SAXException, TikaException, InvalidFormatException {
        URI targetURI = rel.getTargetURI();
        // The target may be null; every later use goes through this null-safe string.
        String targetKey = (targetURI == null) ? null : targetURI.toString();
        if (targetKey != null && handledTarget.contains(targetKey)) {
            return;
        }

        // Relationships owned by a "slide..." part get the slide name as id prefix.
        String sourceDesc = "";
        URI sourceURI = rel.getSourceURI();
        if (sourceURI != null) {
            String sourceName = getJustFileName(sourceURI.getPath());
            if (sourceName.startsWith("slide")) {
                sourceDesc = sourceName + "_";
            }
        }

        String type = rel.getRelationshipType();
        if (rel.getTargetMode() != TargetMode.INTERNAL) {
            // External target - emit as external reference for security analysis.
            // emitExternalRef ignores a null or empty URL, so a missing target URI
            // no longer causes a NullPointerException here.
            if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) {
                emitExternalRef(xhtml, "externalOleObject", targetKey);
                parentMetadata.set(Office.HAS_EXTERNAL_OLE_OBJECTS, true);
            } else if (PackageRelationshipTypes.IMAGE_PART.equals(type)) {
                emitExternalRef(xhtml, "externalImage", targetKey);
            } else {
                emitExternalRef(xhtml, "externalResource", targetKey);
            }
            return;
        }

        PackagePart target;
        try {
            target = source.getRelatedPart(rel);
        } catch (IllegalArgumentException ex) {
            // the related part cannot be resolved; nothing to extract
            return;
        }

        EmbeddedPartMetadata embeddedPartMetadata = embeddedPartMetadataMap.get(rel.getId());
        String embeddedId = sourceDesc + rel.getId();
        boolean handled = true;
        if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type) &&
                TYPE_OLE_OBJECT.equals(target.getContentType())) {
            handleEmbeddedOLE(target, xhtml, embeddedId, parentMetadata, embeddedPartMetadata);
        } else if (PackageRelationshipTypes.IMAGE_PART.equals(type)) {
            handleEmbeddedFile(target, xhtml, embeddedId, embeddedPartMetadata,
                    TikaCoreProperties.EmbeddedResourceType.INLINE);
        } else if (RELATION_MEDIA.equals(type) || RELATION_VIDEO.equals(type) ||
                RELATION_AUDIO.equals(type) ||
                POIXMLDocument.PACK_OBJECT_REL_TYPE.equals(type) ||
                POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) {
            // includes OLE relationships whose target is not an OLE-object content type
            handleEmbeddedFile(target, xhtml, embeddedId, embeddedPartMetadata,
                    TikaCoreProperties.EmbeddedResourceType.ATTACHMENT);
        } else if (XSSFRelation.VBA_MACROS.getRelation().equals(type)) {
            handleMacros(target, xhtml);
        } else if (RELATION_ALTERNATE_FORMAT_CHUNK.equals(type)) {
            // the target is known to be internal here: external targets returned above
            handleEmbeddedFile(target, xhtml, embeddedId, embeddedPartMetadata,
                    TikaCoreProperties.EmbeddedResourceType.ALTERNATE_FORMAT_CHUNK);
        } else {
            handled = false;
        }

        // Only reached when the handler returned normally.
        if (handled && targetKey != null) {
            handledTarget.add(targetKey);
        }
    }


    /**
     * Handles an embedded OLE object in the document.
     * <p>
     * The part is opened as an OLE2 (POIFS) container. If it holds a package entry
     * (see {@code getPackageEntryName}), that entry is parsed as the embedded document;
     * if it is detected as an OLE 1.0 native object, the native payload is parsed;
     * otherwise the part itself is passed to
     * {@code handleEmbeddedFile} as an attachment.
     *
     * @param part                 the package part holding the OLE object
     * @param xhtml                handler receiving the embedded content
     * @param rel                  value stored as the embedded relationship id
     * @param parentMetadata       metadata of the containing document; receives recorded
     *                             stream failures
     * @param embeddedPartMetadata optional extra metadata for the part, may be null
     */
    private void handleEmbeddedOLE(PackagePart part, XHTMLContentHandler xhtml, String rel,
                                   Metadata parentMetadata,
                                   EmbeddedPartMetadata embeddedPartMetadata) throws IOException,
            SAXException, TikaException {
        // A POIFSFileSystem needs to be at least 3 blocks big to be valid
        if (part.getSize() >= 0 && part.getSize() < 512 * 3) {
            // Too small, skip
            return;
        }

        // Open the POIFS (OLE2) structure and process
        POIFSFileSystem fs;
        try {
            fs = new POIFSFileSystem(part.getInputStream());
        } catch (Exception e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
            return;
        }
        try {
            Metadata metadata = Metadata.newInstance(context);
            metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                    TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name());
            metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel);
            metadata.set(TikaCoreProperties.INTERNAL_PATH, part.getPartName().getName());

            DirectoryNode root = fs.getRoot();
            POIFSDocumentType type = POIFSDocumentType.detectType(root);

            String packageEntryName = getPackageEntryName(root);
            try {
                SummaryExtractor summaryExtractor = new SummaryExtractor(metadata);
                summaryExtractor.parseSummaries(root);
            } catch (TikaException e) {
                //swallow -- summary information is optional, best effort only
            }
            if (packageEntryName != null) {
                //OLE 2.0
                updateMetadata(metadata, embeddedPartMetadata);

                // try-with-resources closes the entry stream before the finally block
                // below closes the file system that backs it
                try (TikaInputStream tis = TikaInputStream
                        .get(fs.createDocumentInputStream(packageEntryName))) {
                    if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                        embeddedExtractor
                                .parseEmbedded(tis, xhtml, metadata, context, true);
                    }
                }
            } else if (POIFSDocumentType.OLE10_NATIVE == type) {
                // TIKA-704: OLE 1.0 embedded document
                Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs);
                if (ole.getLabel() != null) {
                    metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, ole.getLabel());
                }
                if (ole.getCommand() != null) {
                    metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
                }
                if (ole.getFileName() != null) {
                    metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
                }
                byte[] data = ole.getDataBuffer();
                if (data != null && embeddedExtractor.shouldParseEmbedded(metadata)) {
                    try (TikaInputStream tis = TikaInputStream.get(data)) {
                        embeddedExtractor
                                .parseEmbedded(tis, xhtml, metadata, context, true);
                    }
                }
            } else {
                handleEmbeddedFile(part, xhtml, rel, embeddedPartMetadata,
                        TikaCoreProperties.EmbeddedResourceType.ATTACHMENT);
            }
        } catch (FileNotFoundException e) {
            // There was no CONTENTS entry, so skip this part
        } catch (Ole10NativeException e) {
            // Could not process an OLE 1.0 entry, so skip this part
        } catch (IOException e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
        } finally {
            // By now every stream opened on the file system has been closed, so a
            // failure while closing the file system can no longer leak one of them.
            fs.close();
        }
    }

    /**
     * Copies the prog id and the full name from the optional embedded-part metadata into
     * {@code metadata}. Null or blank values are skipped, the same way
     * {@code updateResourceName} treats them, so a blank name is never recorded as the
     * resource name.
     *
     * @param metadata             metadata of the embedded document being built
     * @param embeddedPartMetadata extra metadata for the part; may be null, in which case
     *                             nothing is changed
     */
    private void updateMetadata(Metadata metadata, EmbeddedPartMetadata embeddedPartMetadata) {
        if (embeddedPartMetadata == null) {
            return;
        }
        String progId = embeddedPartMetadata.getProgId();
        if (!StringUtils.isBlank(progId)) {
            metadata.set(Office.PROG_ID, progId);
        }
        String fullName = embeddedPartMetadata.getFullName();
        if (!StringUtils.isBlank(fullName)) {
            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fullName);
        }
    }

    /**
     * Picks the name of the entry under {@code root} that holds the embedded payload.
     * <p>
     * "Package" and "CONTENTS" are only considered when the Ole marker entry is present;
     * a lower-case "package" entry is accepted either way.
     *
     * @param root root directory of the OLE2 container
     * @return "Package", "CONTENTS" or "package", or null if none of them applies
     */
    private String getPackageEntryName(DirectoryNode root) {
        //we used to require the CompObj entry as well, before TIKA-3526
        if (root.hasEntry("\u0001Ole")) {
            if (root.hasEntry("Package")) {
                return "Package";
            }
            if (root.hasEntry("CONTENTS")) {
                return "CONTENTS";
            }
        }
        /*
            raw CorelDraw stream may be in an ole bundle
            but there can be other resources for the image
            in other streams under root...think about this...
            see: AZG2X4VXB3KIEDT3OVZC4R645KU5VSOF
        if (root.hasEntry("CorelDRAW")) {
            return "CorelDRAW";
        }*/
        return root.hasEntry("package") ? "package" : null;
    }

    /**
     * Handles an embedded file in the document: builds its metadata (relationship id,
     * resource type, internal path, resource name, content type) and, if the embedded
     * extractor accepts it, parses the part's content.
     *
     * @param part                 the package part holding the embedded file
     * @param xhtml                handler receiving the embedded content
     * @param rel                  value stored as the embedded relationship id
     * @param embeddedPartMetadata optional extra metadata for the part, may be null
     * @param embeddedResourceType the kind of embedded resource to record
     */
    protected void handleEmbeddedFile(PackagePart part, XHTMLContentHandler xhtml,
                                      String rel,
                                      EmbeddedPartMetadata embeddedPartMetadata,
                                      TikaCoreProperties.EmbeddedResourceType embeddedResourceType)
            throws SAXException, IOException, TikaException {
        final Metadata embeddedMetadata = Metadata.newInstance(context);
        embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel);
        embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                embeddedResourceType.name());
        embeddedMetadata.set(TikaCoreProperties.INTERNAL_PATH, part.getPartName().getName());

        // resource name (and prog id, when known), then the content type
        updateResourceName(part, embeddedPartMetadata, embeddedMetadata);
        embeddedMetadata.set(Metadata.CONTENT_TYPE, part.getContentType());

        if (!embeddedExtractor.shouldParseEmbedded(embeddedMetadata)) {
            return;
        }
        // hand the content over to the recursing handler
        try (TikaInputStream tis = TikaInputStream.get(part.getInputStream())) {
            embeddedExtractor.parseEmbedded(tis, xhtml, embeddedMetadata, context, true);
        }
    }

    /**
     * Sets the resource name of an embedded file. A non-blank full name from the optional
     * embedded-part metadata wins; otherwise the last segment of the part name is used.
     * A non-blank prog id, if available, is recorded as well.
     *
     * @param part                 the part whose name serves as fallback
     * @param embeddedPartMetadata optional extra metadata for the part, may be null
     * @param metadata             metadata of the embedded document being built
     */
    private void updateResourceName(PackagePart part, EmbeddedPartMetadata embeddedPartMetadata,
                                    Metadata metadata) {
        String preferredName = null;
        if (embeddedPartMetadata != null) {
            if (!StringUtils.isBlank(embeddedPartMetadata.getProgId())) {
                metadata.set(Office.PROG_ID, embeddedPartMetadata.getProgId());
            }
            preferredName = embeddedPartMetadata.getFullName();
        }
        if (!StringUtils.isBlank(preferredName)) {
            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, preferredName);
            return;
        }

        //TODO -- should we record the literal name of the embedded file?
        final String partName = part.getPartName().getName();
        // lastIndexOf yields -1 when there is no '/', which keeps the whole name
        final String simpleName = partName.substring(partName.lastIndexOf('/') + 1);
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, simpleName);
    }

    /**
     * Emits an external reference as an empty anchor element whose class is
     * {@code external-ref-} followed by {@code refType} and whose href is {@code url}.
     * Used for detecting external resources that could be security risks.
     * Nothing is emitted for a null or empty URL.
     *
     * @param xhtml   handler receiving the anchor
     * @param refType suffix of the anchor's class attribute
     * @param url     the external target
     */
    private void emitExternalRef(XHTMLContentHandler xhtml, String refType, String url)
            throws SAXException {
        final boolean hasUrl = url != null && !url.isEmpty();
        if (hasUrl) {
            final AttributesImpl anchorAttrs = new AttributesImpl();
            anchorAttrs.addAttribute("", "class", "class", "CDATA", "external-ref-" + refType);
            anchorAttrs.addAttribute("", "href", "href", "CDATA", url);
            xhtml.startElement("a", anchorAttrs);
            xhtml.endElement("a");
        }
    }

    /**
     * Populates the {@link XHTMLContentHandler} object received as parameter.
     * <p>
     * Called by {@link #getXHTML(ContentHandler, Metadata, ParseContext)} after the
     * document has been started and before embedded parts and thumbnails are handled.
     *
     * @param xhtml the handler to write the main document content to
     */
    protected abstract void buildXHTML(XHTMLContentHandler xhtml)
            throws SAXException, XmlException, IOException;

    /**
     * Return a list of the main parts of the document, used
     * when searching for embedded resources.
     * This should be all the parts of the document that end
     * up with things embedded into them.
     * <p>
     * The list may contain null elements; the embedded-part search skips them.
     *
     * @return the parts whose relationships are searched for embedded resources
     * @throws TikaException if the parts cannot be determined
     */
    protected abstract List<PackagePart> getMainDocumentParts() throws TikaException;


    /**
     * Extracts the macros stored in the given part, provided macro extraction is enabled
     * in the {@link OfficeParserConfig} found in the parse context.
     *
     * @param macroPart the part holding the macro container
     * @param handler   handler receiving the extracted macros
     * @throws TikaException if the part cannot be read as an OLE2 container
     */
    void handleMacros(PackagePart macroPart, ContentHandler handler)
            throws TikaException, SAXException {
        if (!context.get(OfficeParserConfig.class).isExtractMacros()) {
            return;
        }
        try (InputStream macroStream = macroPart.getInputStream();
             POIFSFileSystem macroFs = new POIFSFileSystem(macroStream)) {
            //Macro reading exceptions are already swallowed here
            OfficeParser.extractMacros(macroFs, handler, embeddedExtractor, context);
        } catch (IOException ioException) {
            throw new TikaException("Broken OOXML file", ioException);
        }
    }

    /**
     * This is used by the SAX docx and pptx decorators to load hyperlinks and
     * other linked objects.
     * <p>
     * Hyperlink relationships are mapped to their target URL. Relationships of the types
     * listed in {@link #EMBEDDED_RELATIONSHIPS} are mapped to the file name of their
     * target (the full target string if no file name can be derived, the empty string if
     * there is no target at all). If a relationship id occurs in both groups, the
     * embedded entry wins because it is stored last.
     *
     * @param bodyPart        the part whose relationships are read
     * @param includeInternal whether hyperlinks with an internal target are included
     * @param metadata        receives the failure if the relationships cannot be read
     * @return a map from relationship id to target; contains whatever was collected
     * before a failure, never null
     */
    protected Map<String, String> loadLinkedRelationships(PackagePart bodyPart,
                                                          boolean includeInternal,
                                                          Metadata metadata) {
        Map<String, String> linkedRelationships = new HashMap<>();
        try {
            // The relationship collections are iterated directly, as is done elsewhere in
            // this class, instead of fetching every element by index.
            for (PackageRelationship pr : bodyPart
                    .getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation())) {
                if (pr == null) {
                    continue;
                }
                if (!includeInternal && TargetMode.INTERNAL.equals(pr.getTargetMode())) {
                    continue;
                }
                String id = pr.getId();
                URI target = pr.getTargetURI();
                if (id != null && target != null) {
                    linkedRelationships.put(id, target.toString());
                }
            }

            for (String relType : EMBEDDED_RELATIONSHIPS) {
                for (PackageRelationship pr : bodyPart.getRelationshipsByType(relType)) {
                    if (pr == null) {
                        continue;
                    }
                    String id = pr.getId();
                    if (id == null) {
                        continue;
                    }
                    String fileName = null;
                    URI target = pr.getTargetURI();
                    if (target != null) {
                        String uriString = target.toString();
                        try {
                            fileName = FileHelper.getFilename(new File(uriString));
                        } catch (Exception e) {
                            // fall back to the complete target string
                            fileName = uriString;
                        }
                    }
                    linkedRelationships.put(id, (fileName == null) ? "" : fileName);
                }
            }

        } catch (InvalidFormatException e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        }
        return linkedRelationships;
    }

    /**
     * This should handle the comments, master, notes, with the streaming "general docx/pptx
     * handler".
     * <p>
     * If the parent part has relationships of the given type, a {@code div} with the given
     * class is opened, every related part is SAX-parsed into it, and the {@code div} is
     * closed. A related part that cannot be resolved or read is recorded as a warning on
     * {@code parentMetadata} and skipped, so the remaining parts are still processed.
     *
     * @param contentType     the relationship type used to find the related parts
     * @param xhtmlClassLabel value of the class attribute of the wrapping {@code div}
     * @param parentPart      the part whose related parts are processed
     * @param parentMetadata  receives warnings for parts that could not be processed
     * @param contentHandler  handler receiving the wrapping element and the parsed content
     */
    void handleGeneralTextContainingPart(String contentType, String xhtmlClassLabel,
                                         PackagePart parentPart, Metadata parentMetadata,
                                         ContentHandler contentHandler) throws SAXException {

        PackageRelationshipCollection relatedPartPRC = null;

        try {
            relatedPartPRC = parentPart.getRelationshipsByType(contentType);
        } catch (InvalidFormatException e) {
            parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                    ExceptionUtils.getStackTrace(e));
        }
        if (relatedPartPRC == null || relatedPartPRC.size() == 0) {
            return;
        }

        AttributesImpl attributes = new AttributesImpl();
        attributes.addAttribute("", "class", "class",
                "CDATA", xhtmlClassLabel);
        contentHandler.startElement("", "div", "div", attributes);
        for (PackageRelationship relatedPartPackageRelationship : relatedPartPRC) {
            PackagePart relatedPartPart;
            try {
                relatedPartPart = parentPart.getRelatedPart(relatedPartPackageRelationship);
            } catch (InvalidFormatException | IllegalArgumentException e) {
                // IllegalArgumentException is what handleEmbeddedPart also expects from
                // getRelatedPart; record it instead of letting it abort the whole parse
                // with the div left open.
                parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                        ExceptionUtils.getStackTrace(e));
                continue;
            }
            try (InputStream stream = relatedPartPart.getInputStream()) {
                XMLReaderUtils.parseSAX(stream,
                        new EmbeddedContentHandler(contentHandler), context);
            } catch (IOException | TikaException e) {
                parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                        ExceptionUtils.getStackTrace(e));
            }
        }
        contentHandler.endElement("", "div", "div");
    }

}