// InputStreamDigester.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.digest;

import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.security.Provider;
import java.util.Objects;

import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.StringUtils;

/**
 * Digester that uses {@link TikaInputStream#enableRewind()} and {@link TikaInputStream#rewind()}
 * to read the entire stream for digesting, then rewind for subsequent processing.
 * <p>
 * An instance only stores the algorithm name, the metadata key and the encoder; a new
 * {@link MessageDigest} is obtained on every call to
 * {@link #digest(TikaInputStream, Metadata, ParseContext)}.
 */
public class InputStreamDigester implements Digester {

    /** Size in bytes of the buffer used to pull data from the stream while digesting. */
    private static final int BUFFER_SIZE = 8192;

    private final String algorithm;
    private final String metadataKey;
    private final Encoder encoder;

    /**
     * @param algorithm   name of the digest algorithm to retrieve from the Provider
     * @param metadataKey the full metadata key to use when storing the digest
     *                    (e.g., "X-TIKA:digest:MD5" or "X-TIKA:digest:SHA256:BASE32")
     * @param encoder     encoder to convert the byte array returned from the digester to a
     *                    string
     * @throws NullPointerException if any argument is {@code null}
     */
    public InputStreamDigester(String algorithm, String metadataKey, Encoder encoder) {
        // Fail fast: a null argument would otherwise only surface inside digest(), after
        // enableRewind() has been called or after the stream has been read to its end.
        this.algorithm = Objects.requireNonNull(algorithm, "algorithm must not be null");
        this.metadataKey = Objects.requireNonNull(metadataKey, "metadataKey must not be null");
        this.encoder = Objects.requireNonNull(encoder, "encoder must not be null");
    }

    /**
     * Stores {@code length} under {@link Metadata#CONTENT_LENGTH} unless the metadata
     * already holds a non-blank value for that key.
     *
     * @param length   number of bytes that were read while digesting
     * @param metadata metadata object to update
     */
    private static void setContentLength(long length, Metadata metadata) {
        if (StringUtils.isBlank(metadata.get(Metadata.CONTENT_LENGTH))) {
            //only add it if it hasn't been populated already
            metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
        }
    }

    /**
     * Creates a new {@link MessageDigest} for the configured algorithm, using the
     * provider returned by {@link #getProvider()} when that is non-null.
     *
     * @return a new MessageDigest instance
     * @throws IllegalArgumentException wrapping the {@link NoSuchAlgorithmException} if the
     *                                  algorithm is not available
     */
    private MessageDigest newMessageDigest() {
        try {
            Provider provider = getProvider();
            if (provider == null) {
                return MessageDigest.getInstance(algorithm);
            } else {
                return MessageDigest.getInstance(algorithm, provider);
            }
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalArgumentException(e);
        }
    }

    /**
     * When subclassing this, be careful to ensure that your provider is
     * thread-safe (not likely) or return a new provider with each call.
     *
     * @return provider to use to get the MessageDigest from the algorithm name.
     * Default is to return null.
     */
    protected Provider getProvider() {
        return null;
    }

    /**
     * Digests the TikaInputStream and stores the result in metadata.
     * <p>
     * Uses {@link TikaInputStream#enableRewind()} to ensure the stream can be
     * rewound after digesting, then calls {@link TikaInputStream#rewind()} to
     * reset the stream for subsequent processing.
     * <p>
     * The number of bytes read is also stored as the content length if the metadata
     * does not already contain one. If reading or storing the digest throws, the
     * exception propagates and {@link TikaInputStream#rewind()} is not called.
     *
     * @param tis          TikaInputStream to digest
     * @param metadata     metadata in which to store the digest information
     * @param parseContext ParseContext -- not actually used yet, but there for future expansion
     * @throws IOException              on IO problem
     * @throws IllegalArgumentException if the algorithm couldn't be found
     */
    @Override
    public void digest(TikaInputStream tis, Metadata metadata, ParseContext parseContext)
            throws IOException {
        // Resolve the algorithm before touching the stream, so an unknown algorithm
        // is reported without enableRewind() having been called.
        MessageDigest messageDigest = newMessageDigest();

        tis.enableRewind();

        byte[] buffer = new byte[BUFFER_SIZE];
        long total = 0;
        int read;
        while ((read = tis.read(buffer)) != -1) {
            messageDigest.update(buffer, 0, read);
            total += read;
        }

        setContentLength(total, metadata);
        metadata.set(metadataKey, encoder.encode(messageDigest.digest()));

        tis.rewind();
    }

}