XWPFFeatureExtractor.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.ooxml.xwpf;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;

import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;

/**
 * This is designed to extract features that are useful for forensics, e-discovery and digital preservation.
 * Specifically, the presence of: tracked changes, hidden text, comments and comment authors. Because several of these
 * features can be placed on run properties, which can be in lots of places, we're scraping
 * the document xml
 */
public class XWPFFeatureExtractor {

    public void process(XWPFDocument xwpfDocument, Metadata metadata, ParseContext parseContext) {
        try (InputStream is = xwpfDocument.getPackagePart()
                                          .getInputStream()) {
            FeatureHandler featureHandler = new FeatureHandler();
            XMLReaderUtils.parseSAX(is, featureHandler, parseContext);
            if (featureHandler.hasComments) {
                metadata.set(Office.HAS_COMMENTS, true);
            }
            if (featureHandler.hasHidden) {
                metadata.set(Office.HAS_HIDDEN_TEXT, true);
            }
            if (featureHandler.hasTrackChanges) {
                metadata.set(Office.HAS_TRACK_CHANGES, true);
            }
            if (! featureHandler.authors.isEmpty()) {
                for (String author : featureHandler.authors) {
                    metadata.add(Office.COMMENT_PERSONS, author);
                }
            }
        } catch (IOException | TikaException | SAXException e) {
            //swallow
        }
    }

    private static class FeatureHandler extends DefaultHandler {
        //see: https://www.ericwhite.com/blog/using-xml-dom-to-detect-tracked-revisions-in-an-open-xml-wordprocessingml-document/
        private static final Set<String> TRACK_CHANGES = Set.of("ins", "del", "moveFrom", "moveTo");
        private final Set<String> authors = new HashSet<>();
        private boolean hasHidden = false;
        private boolean hasTrackChanges = false;
        private boolean hasComments = false;

        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts)
                throws SAXException {
            //we could check to ensure that the vanish element actually surrounds text
            //the current check could lead to false positives where <w:vanish/> is around a space or no text.
            if ("vanish".equals(localName)) {
                hasHidden = true;
            } else if (TRACK_CHANGES.contains(localName)) {
                String trackChangesAuthor = XMLReaderUtils.getAttrValue("author", atts);
                if (!StringUtils.isBlank(trackChangesAuthor)) {
                    authors.add(trackChangesAuthor);
                }
                hasTrackChanges = true;
            } else if ("commentReference".equals(localName) || "commentRangeStart".equals(localName)) {
                hasComments = true;
            }
        }
    }
}