// AutoDetectReader.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.List;

import org.xml.sax.InputSource;

import org.apache.tika.config.ServiceLoader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.CharsetUtils;

/**
 * An input stream reader that automatically detects the character encoding
 * to be used for converting bytes to characters.
 * <p>
 * The encoding is resolved once, while the reader is being constructed, and is
 * afterwards available through {@link #getCharset()}. A leading byte order mark
 * (U+FEFF) is skipped so that it never shows up in the decoded text.
 *
 * @since Apache Tika 1.2
 */
public class AutoDetectReader extends BufferedReader {

    /** Service loader bound to the class loader that loaded this class. */
    private static final ServiceLoader DEFAULT_LOADER =
            new ServiceLoader(AutoDetectReader.class.getClassLoader());

    /**
     * Detector used by the constructors that take no explicit detector or loader;
     * a composite of every {@link EncodingDetector} found by {@link #DEFAULT_LOADER}.
     */
    private static final EncodingDetector DEFAULT_DETECTOR = new CompositeEncodingDetector(
            DEFAULT_LOADER.loadServiceProviders(EncodingDetector.class));

    /** The character encoding used to decode the underlying byte stream. */
    private final Charset charset;

    /**
     * Creates a reader that decodes the given stream with an already known charset.
     *
     * @param stream  the byte stream to decode
     * @param charset the charset to decode the stream with
     * @throws IOException if reading the first character (the BOM check) fails
     */
    private AutoDetectReader(InputStream stream, Charset charset) throws IOException {
        super(new InputStreamReader(stream, charset));
        this.charset = charset;

        // TIKA-240: Drop the BOM if present
        // Peek at the first character: anything other than U+FEFF (including
        // end of stream, where read() returns -1) is pushed back via reset().
        mark(1);
        if (read() != '\ufeff') { // zero-width no-break space
            reset();
        }
    }

    /**
     * Creates a reader whose encoding is determined by the given detector.
     *
     * @param stream           the byte stream to decode; wrapped in a
     *                         {@link TikaInputStream} unless it already is one
     * @param metadata         document metadata handed to the detector; its content type
     *                         is also consulted for a {@code charset} parameter when the
     *                         detector returns no result
     * @param encodingDetector the detector to ask for the character encoding
     * @throws IOException   if the detector or the BOM check fails with an I/O error
     * @throws TikaException if the character encoding cannot be determined
     */
    public AutoDetectReader(InputStream stream, Metadata metadata,
                            EncodingDetector encodingDetector) throws IOException, TikaException {
        // IMPORTANT: Only call getTikaInputStream once, then reuse the same instance.
        // Calling it twice creates two different TikaInputStreams sharing the same underlying
        // stream, causing the second one's reads to advance the position for both.
        this(getTikaInputStream(stream), metadata, encodingDetector);
    }

    /**
     * Runs detection on the given {@link TikaInputStream} and then decodes that same
     * instance with the detected charset.
     *
     * @param tis              the stream used both for detection and for reading
     * @param metadata         document metadata handed to the detector
     * @param encodingDetector the detector to ask for the character encoding
     * @throws IOException   if the detector or the BOM check fails with an I/O error
     * @throws TikaException if the character encoding cannot be determined
     */
    private AutoDetectReader(TikaInputStream tis, Metadata metadata,
                             EncodingDetector encodingDetector) throws IOException, TikaException {
        // NOTE(review): decoding starts from wherever detection leaves the stream;
        // presumably detectors restore the position they started from - confirm.
        this(tis, detect(tis, metadata, encodingDetector));
    }

    /**
     * Creates a reader whose encoding is determined by a composite of all
     * {@link EncodingDetector} providers loaded through the given service loader.
     *
     * @param stream   the byte stream to decode; wrapped in a
     *                 {@link TikaInputStream} unless it already is one
     * @param metadata document metadata handed to the detectors
     * @param loader   the service loader used to look up the encoding detectors
     * @throws IOException   if a detector or the BOM check fails with an I/O error
     * @throws TikaException if the character encoding cannot be determined
     */
    public AutoDetectReader(InputStream stream, Metadata metadata, ServiceLoader loader)
            throws IOException, TikaException {
        this(getTikaInputStream(stream), metadata,
                new CompositeEncodingDetector(loader.loadServiceProviders(EncodingDetector.class)));
    }

    /**
     * Creates a reader whose encoding is determined by the default detector.
     *
     * @param stream   the byte stream to decode
     * @param metadata document metadata handed to the detector
     * @throws IOException   if the detector or the BOM check fails with an I/O error
     * @throws TikaException if the character encoding cannot be determined
     */
    public AutoDetectReader(InputStream stream, Metadata metadata)
            throws IOException, TikaException {
        this(stream, metadata, DEFAULT_DETECTOR);
    }

    /**
     * Creates a reader whose encoding is determined by the default detector,
     * starting from an empty {@link Metadata} instance.
     *
     * @param stream the byte stream to decode
     * @throws IOException   if the detector or the BOM check fails with an I/O error
     * @throws TikaException if the character encoding cannot be determined
     */
    public AutoDetectReader(InputStream stream) throws IOException, TikaException {
        this(stream, new Metadata());
    }

    /**
     * Determines the character encoding of the given stream.
     * <p>
     * The first result reported by the detector wins. When the detector reports
     * nothing, the {@code charset} parameter of the content type in the metadata is
     * used instead, and the metadata is updated to record that fallback.
     *
     * @param tis      the stream to inspect
     * @param metadata document metadata; read for hints and updated when the
     *                 content-type fallback is used
     * @param detector the detector to ask for the character encoding
     * @return the detected charset
     * @throws IOException   if the detector fails with an I/O error
     * @throws TikaException if neither the detector nor the metadata yields a charset
     */
    private static Charset detect(TikaInputStream tis, Metadata metadata,
                                  EncodingDetector detector)
            throws IOException, TikaException {
        // Ask all given detectors for the character encoding
        List<EncodingResult> results = detector.detect(tis, metadata, new ParseContext());
        if (!results.isEmpty()) {
            return results.get(0).getCharset();
        }

        // Try determining the encoding based on hints in document metadata
        MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
        if (type != null) {
            String charsetParam = type.getParameters().get("charset");
            if (charsetParam != null) {
                try {
                    Charset cs = CharsetUtils.forName(charsetParam);
                    metadata.set(TikaCoreProperties.DETECTED_ENCODING, cs.name());
                    metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
                            "AutoDetectReader-charset-metadata-fallback");
                    return cs;
                } catch (IllegalArgumentException ignored) {
                    // The charset hint could not be resolved to a Charset; deliberately
                    // fall through so the failure is reported as a TikaException below.
                }
            }
        }

        throw new TikaException("Failed to detect the character encoding of a document");
    }

    /**
     * Returns the given stream as a {@link TikaInputStream}: the stream itself when it
     * already is one, otherwise the result of {@link TikaInputStream#get(InputStream)}.
     *
     * @param stream the stream to adapt
     * @return a {@link TikaInputStream} view of {@code stream}
     */
    private static TikaInputStream getTikaInputStream(InputStream stream) {
        if (stream instanceof TikaInputStream) {
            return (TikaInputStream) stream;
        }
        return TikaInputStream.get(stream);
    }


    /**
     * Returns the character encoding this reader decodes its input with.
     *
     * @return the charset determined at construction time
     */
    public Charset getCharset() {
        return charset;
    }

    /**
     * Wraps this reader in a SAX {@link InputSource} whose encoding is set to the
     * name of the detected charset.
     *
     * @return a new input source backed by this reader
     */
    public InputSource asInputSource() {
        InputSource source = new InputSource(this);
        source.setEncoding(charset.name());
        return source;
    }

}