UniversalEncodingDetector.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.txt;

import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.List;

import org.apache.tika.config.ConfigDeserializer;
import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;

/**
 * {@link EncodingDetector} that feeds the leading bytes of a stream to a
 * {@link UniversalEncodingListener} (juniversalchardet) and reports the
 * charset it settles on, if any.
 * <p>
 * At most {@link Config#getMarkLimit()} bytes are read from the stream, and
 * the stream is reset to its starting position before {@code detect} returns.
 */
@TikaComponent(spi = false, name = "universal-encoding-detector")
public class UniversalEncodingDetector implements EncodingDetector {

    /** Size of the chunk buffer used when feeding the listener. */
    private static final int BUFSIZE = 1024;

    /** Default maximum number of bytes inspected per stream (16 * BUFSIZE). */
    private static final int DEFAULT_MARK_LIMIT = 16 * BUFSIZE;

    /**
     * Configuration class for JSON deserialization.
     */
    public static class Config implements Serializable {
        /** Maximum number of bytes read from the stream during detection. */
        private int markLimit = DEFAULT_MARK_LIMIT;

        /**
         * @return the maximum number of bytes read from the stream during detection
         */
        public int getMarkLimit() {
            return markLimit;
        }

        /**
         * @param markLimit the maximum number of bytes to read from the stream
         *                  during detection
         */
        public void setMarkLimit(int markLimit) {
            this.markLimit = markLimit;
        }
    }

    private Config defaultConfig = new Config();

    /**
     * Creates a detector that uses the default {@link Config}.
     */
    public UniversalEncodingDetector() {
    }

    /**
     * Constructor with explicit Config object.
     *
     * @param config the configuration
     */
    public UniversalEncodingDetector(Config config) {
        this.defaultConfig = config;
    }

    /**
     * Constructor for JSON configuration.
     * Requires Jackson on the classpath.
     *
     * @param jsonConfig JSON configuration
     */
    public UniversalEncodingDetector(JsonConfig jsonConfig) {
        this(ConfigDeserializer.buildConfig(jsonConfig, Config.class));
    }

    /**
     * Attempts to detect the character encoding of the given stream.
     * <p>
     * The stream is marked with the configured mark limit, read in chunks of
     * at most {@code BUFSIZE} bytes until the limit is reached, the stream
     * ends, or the listener reports that it is done, and is then reset.
     *
     * @param tis          the stream to inspect; may be {@code null}
     * @param metadata     metadata handed to the {@link UniversalEncodingListener}
     * @param parseContext not used by this implementation
     * @return a single-element list holding the detected charset with a fixed
     *         confidence of {@code 0.5f}, or an empty list if {@code tis} is
     *         {@code null}, nothing was detected, or a {@link LinkageError}
     *         occurred while running the detector
     * @throws IOException if reading from or resetting the stream fails
     */
    public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata,
                                       ParseContext parseContext) throws IOException {
        if (tis == null) {
            return Collections.emptyList();
        }

        int markLimit = defaultConfig.getMarkLimit();
        tis.mark(markLimit);
        try {
            UniversalEncodingListener listener = new UniversalEncodingListener(metadata);

            byte[] b = new byte[BUFSIZE];
            int n = 0;
            // Cap every read -- including the first -- at the bytes remaining
            // under the mark limit, so we never read past what mark() was
            // asked to remember (which would make reset() unreliable).
            while (n < markLimit && !listener.isDone()) {
                int m = tis.read(b, 0, Math.min(b.length, markLimit - n));
                if (m == -1) {
                    break;
                }
                listener.handleData(b, 0, m);
                n += m;
            }

            Charset detected = listener.dataEnd();
            if (detected == null) {
                return Collections.emptyList();
            }
            // juniversalchardet doesn't expose per-result confidence;
            // use a mid-range value to allow arbitration to override when needed
            return List.of(new EncodingResult(detected, 0.5f));
        } catch (LinkageError e) {
            return Collections.emptyList(); // juniversalchardet is not available
        } finally {
            tis.reset();
        }
    }

    /**
     * @return the configuration this detector was constructed with
     */
    public Config getDefaultConfig() {
        return defaultConfig;
    }
}