// CharSoupEncodingDetector.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.EncodingDetectorContext;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.detect.MetaEncodingDetector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;

/**
 * A {@link MetaEncodingDetector} that uses the CharSoup language detector
 * to arbitrate when base encoding detectors disagree.
 *
 * <p>When base detectors all agree, the unanimous charset is returned
 * without any language detection. When they disagree, raw bytes are
 * read from the stream, decoded with each candidate charset, and each
 * decoded text is scored by {@link CharSoupLanguageDetector}. The
 * charset that produces the highest-confidence language detection wins.</p>
 *
 * <p>To enable, add this detector to your encoding detector chain in
 * tika-config:</p>
 * <pre>{@code
 * "encoding-detectors": [
 *   { "default-encoding-detector": {} },
 *   { "charsoup-encoding-detector": {} }
 * ]
 * }</pre>
 *
 * @since Apache Tika 3.2
 */
@TikaComponent(name = "charsoup-encoding-detector")
public class CharSoupEncodingDetector implements MetaEncodingDetector {

    /** Serialization version identifier for this class. */
    private static final long serialVersionUID = 1L;

    /** Logger used for the debug traces emitted by the generative tiebreaker. */
    private static final Logger LOG = LoggerFactory.getLogger(CharSoupEncodingDetector.class);

    /**
     * Initial value of {@code readLimit}: the maximum number of bytes read
     * from the stream when candidate charsets have to be arbitrated.
     */
    private static final int DEFAULT_READ_LIMIT = 16384;

    /** Classpath resource from which the generative language model is loaded. */
    private static final String GLM_RESOURCE = GenerativeLanguageModel.DEFAULT_MODEL_RESOURCE;

    /**
     * Minimum z-score for the generative-model tiebreaker to consider a
     * candidate "language-like enough" to win. Candidates below this are
     * treated as mojibake: when the best z-score found is below this value
     * the tiebreaker reports no winner.
     */
    private static final float MIN_GENERATIVE_ZSCORE = -4.0f;

    private static final GenerativeLanguageModel GLM;

    static {
        try {
            GLM = GenerativeLanguageModel.loadFromClasspath(GLM_RESOURCE);
        } catch (IOException e) {
            throw new RuntimeException("Failed to load generative language model: "
                    + GLM_RESOURCE, e);
        }
    }

    /**
     * Symmetric confusable peer groups: within each group, encoding variants
     * (e.g. ISO-8859-6 vs windows-1256) produce different decoded text for the
     * same byte sequence (unlike ISO-8859-1 vs windows-1252 which are functional
     * supersets). When the language-quality winner and a DECLARATIVE result
     * are in the same peer group, the language model cannot reliably
     * distinguish them - it merely reflects which variant happens to produce
     * Arabic (or Cyrillic, ...) n-grams its training data favoured.
     * In that case we prefer the explicit declaration.
     *
     * <p>Keys are canonical charset names; each value is the set of names
     * (including the key itself) that the key is confusable with. A name that
     * appears in more than one row below (e.g. windows-1257) maps to the
     * union of those rows, so the peer relation is the same whichever of the
     * two charsets is used for the lookup.</p>
     */
    private static final Map<String, Set<String>> PEER_GROUPS;

    static {
        Map<String, Set<String>> m = new HashMap<>();
        for (String[] group : new String[][] {
                {"ISO-8859-1",  "ISO-8859-15", "windows-1252"},
                {"ISO-8859-2",  "windows-1250"},
                {"ISO-8859-5",  "windows-1251"},
                {"KOI8-R",      "KOI8-U"},
                {"ISO-8859-6",  "windows-1256"},
                {"ISO-8859-7",  "windows-1253"},
                {"ISO-8859-8",  "windows-1255"},
                {"ISO-8859-9",  "windows-1254"},
                {"ISO-8859-13", "windows-1257"},
                {"ISO-8859-4",  "windows-1257"},
        }) {
            List<String> members = Arrays.asList(group);
            for (String name : group) {
                // Merge instead of overwrite: a plain put() would make the last
                // row win for a name listed twice and break symmetry.
                m.computeIfAbsent(name, k -> new HashSet<>()).addAll(members);
            }
        }
        m.replaceAll((name, peers) -> Collections.unmodifiableSet(peers));
        PEER_GROUPS = Collections.unmodifiableMap(m);
    }

    /**
     * Tells whether two charsets belong to the same confusable peer group.
     *
     * @param a charset whose canonical name is used to look up the group
     * @param b charset whose canonical name is searched for in that group
     * @return {@code true} if {@code a} has a peer group and it contains
     *         {@code b}; {@code false} otherwise
     */
    private static boolean arePeers(Charset a, Charset b) {
        Set<String> group = PEER_GROUPS.get(a.name());
        if (group == null) {
            return false;
        }
        return group.contains(b.name());
    }

    /**
     * Runs a fresh {@link CharSoupLanguageDetector} over {@code text} and
     * reports whether its top-ranked result has a raw score above zero.
     *
     * @param text decoded text to score; may be {@code null}
     * @return {@code false} for {@code null}/empty text or when the detector
     *         returns no results; otherwise whether the first result's raw
     *         score is strictly positive
     */
    private static boolean hasPositiveLangSignal(String text) {
        if (text != null && !text.isEmpty()) {
            char[] buffer = text.toCharArray();
            CharSoupLanguageDetector scorer = new CharSoupLanguageDetector();
            scorer.addText(buffer, 0, buffer.length);
            List<LanguageResult> ranked = scorer.detectAll();
            if (ranked.isEmpty()) {
                return false;
            }
            return ranked.get(0).getRawScore() > 0f;
        }
        return false;
    }

    /**
     * Maximum number of bytes read from the stream for arbitration; used as
     * both the mark limit and the buffer size when reading. Defaults to
     * {@link #DEFAULT_READ_LIMIT}.
     */
    private int readLimit = DEFAULT_READ_LIMIT;

    /**
     * Chooses a single charset from the results collected in the
     * {@link EncodingDetectorContext} found in {@code parseContext}.
     *
     * <p>If there is at most one distinct charset, the first result's charset
     * is used and the arbitration info is set to {@code "unanimous"};
     * otherwise the decision is delegated to {@code arbitrate}.</p>
     *
     * @return a one-element list holding the winning charset with the
     *         context's top confidence for it, or an empty list when there is
     *         no context, no results, or no winner
     */
    @Override
    public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata,
                                       ParseContext parseContext) throws IOException {
        EncodingDetectorContext context = parseContext.get(EncodingDetectorContext.class);
        if (context == null) {
            return Collections.emptyList();
        }
        if (context.getResults().isEmpty()) {
            return Collections.emptyList();
        }

        Set<Charset> distinct = context.getUniqueCharsets();

        final Charset winner;
        if (distinct.size() > 1) {
            winner = arbitrate(tis, context, distinct);
        } else {
            context.setArbitrationInfo("unanimous");
            winner = context.getResults().get(0).getCharset();
        }

        if (winner != null) {
            float confidence = context.getTopConfidenceFor(winner);
            return List.of(new EncodingResult(winner, confidence));
        }
        return Collections.emptyList();
    }

    /**
     * Picks one charset when the collected results name more than one.
     *
     * <p>Steps, in order:</p>
     * <ol>
     *   <li>With no stream or no readable bytes, return the first result's
     *       charset ({@code "no-stream"} / {@code "empty-stream"}).</li>
     *   <li>Strip a leading BOM, decode the bytes with every candidate charset
     *       and remove markup tags from each decoding.</li>
     *   <li>Ask {@link CharSoupLanguageDetector#compareLanguageSignal} for a
     *       winner. A {@code null} answer is treated as inconclusive: the
     *       generative tiebreaker is tried, then a DECLARATIVE result that
     *       decodes at least as cleanly as the first result, then the first
     *       result itself.</li>
     *   <li>Finally, a DECLARATIVE result different from the winner may still
     *       override it: when it decodes to the same text, when it is a
     *       confusable peer decoding at least as cleanly, or when it decodes
     *       at least as cleanly and shows a positive language signal.</li>
     * </ol>
     *
     * <p>The reason for the decision is recorded through
     * {@code context.setArbitrationInfo(...)} on every path.</p>
     *
     * @param tis            stream to probe; may be {@code null}
     * @param context        results of the base detectors; must hold at least
     *                       one result
     * @param uniqueCharsets distinct candidate charsets to compare
     * @return the chosen charset
     * @throws IOException if reading from the stream fails
     */
    private Charset arbitrate(TikaInputStream tis,
                              EncodingDetectorContext context,
                              Set<Charset> uniqueCharsets) throws IOException {
        EncodingDetectorContext.Result firstResult = context.getResults().get(0);

        if (tis == null) {
            context.setArbitrationInfo("no-stream");
            return firstResult.getCharset();
        }

        byte[] bytes = readBytes(tis);
        if (bytes == null || bytes.length == 0) {
            context.setArbitrationInfo("empty-stream");
            return firstResult.getCharset();
        }
        bytes = stripBomBytes(bytes);

        // LinkedHashMap keeps the candidates in the iteration order of uniqueCharsets.
        Map<Charset, String> candidates = new LinkedHashMap<>();
        for (Charset candidate : uniqueCharsets) {
            candidates.put(candidate, stripTags(decode(bytes, candidate)));
        }

        CharSoupLanguageDetector langDetector = new CharSoupLanguageDetector();
        Charset bestCharset = langDetector.compareLanguageSignal(candidates);
        if (bestCharset == null) {
            // Discriminative model inconclusive. Try generative model as tiebreaker.
            Charset generativeWinner = generativeTiebreak(candidates);
            if (generativeWinner != null) {
                context.setArbitrationInfo("scored-inconclusive-generative-tiebreak");
                return generativeWinner;
            }

            // Generative model also inconclusive. When a DECLARATIVE result
            // (HTML meta charset, BOM, HTTP Content-Type) exists and decodes
            // the bytes at least as cleanly as the statistical fallback,
            // trust the declaration. This covers:
            //  - Pure-ASCII probe (both decodings identical): prefer declared.
            //  - Probe with high bytes valid in BOTH charsets (e.g. Cyrillic
            //    in a page starting with ASCII JavaScript).
            Charset fallback = firstResult.getCharset();
            String fallbackDecoded = candidates.get(fallback);
            float fallbackJunk = fallbackDecoded != null
                    ? CharSoupLanguageDetector.junkRatio(fallbackDecoded) : 1f;

            Charset cleanerDeclared = null;
            for (EncodingDetectorContext.Result r : context.getResults()) {
                if (r.getResultType() == EncodingResult.ResultType.DECLARATIVE) {
                    String declaredDecoded = candidates.get(r.getCharset());
                    float declaredJunk = declaredDecoded != null
                            ? CharSoupLanguageDetector.junkRatio(declaredDecoded) : 1f;
                    if (declaredJunk <= fallbackJunk) {
                        cleanerDeclared = r.getCharset();
                        break;
                    }
                }
            }

            if (cleanerDeclared != null) {
                context.setArbitrationInfo("scored-inconclusive-use-cleaner-declared");
                return cleanerDeclared;
            }
            bestCharset = fallback;
        }

        // If a DECLARATIVE result (e.g. HTML meta charset) decodes the bytes to the same
        // string as the language-quality winner, prefer the declaration. This validates the
        // declared encoding against the actual bytes: if they are functionally equivalent,
        // trust the author's stated encoding. If they produce different text (a real conflict
        // - e.g. a lying BOM or a wrong meta tag), the bytes win and the language scorer's
        // choice stands.
        //
        // Additionally, when the winner and a DECLARATIVE charset are in the same confusable
        // peer group (e.g. ISO-8859-6 vs windows-1256) and the declared charset decodes
        // cleanly (low junk ratio), the language model cannot reliably distinguish them -
        // they both produce valid same-script text. Prefer the explicit declaration.
        String winnerDecoded = candidates.get(bestCharset);
        float winnerJunk = winnerDecoded != null ? CharSoupLanguageDetector.junkRatio(winnerDecoded) : 1f;
        if (winnerDecoded != null) {
            for (EncodingDetectorContext.Result r : context.getResults()) {
                if (r.getResultType() == EncodingResult.ResultType.DECLARATIVE
                        && !r.getCharset().equals(bestCharset)) {
                    Charset declared = r.getCharset();
                    String declaredDecoded = candidates.get(declared);
                    if (declaredDecoded == null) {
                        continue;
                    }
                    if (declaredDecoded.equals(winnerDecoded)) {
                        context.setArbitrationInfo("scored-prefer-declared");
                        return declared;
                    }
                    float declaredJunk = CharSoupLanguageDetector.junkRatio(declaredDecoded);
                    // Same-script peer group: language model cannot distinguish variants
                    // (e.g. ISO-8859-6 vs windows-1256 both produce valid Arabic text).
                    // Prefer the declaration when it decodes at least as cleanly as the winner.
                    if (arePeers(bestCharset, declared) && declaredJunk <= winnerJunk) {
                        context.setArbitrationInfo("scored-prefer-declared-peer");
                        return declared;
                    }
                    // DECLARATIVE result decodes cleanly and has a positive language signal:
                    // trust the declaration over the language-model winner. The language scorer
                    // can be fooled on short probes (e.g. 4 CJK code points from a wrong-endian
                    // UTF-16 decode score higher than "test" in English), but a DECLARATIVE
                    // charset that itself produces meaningful text is almost certainly correct.
                    // A lying BOM or wrong meta-tag would produce high junk (replacement chars),
                    // so the declaredJunk guard prevents false positives.
                    //
                    // The cheap junk comparison comes first so that the language detection
                    // inside hasPositiveLangSignal only runs when its answer can matter.
                    if (declaredJunk <= winnerJunk && hasPositiveLangSignal(declaredDecoded)) {
                        context.setArbitrationInfo("scored-prefer-declared-positive-lang");
                        return declared;
                    }
                }
            }
        }

        context.setArbitrationInfo("scored");
        return bestCharset;
    }

    /**
     * Tiebreaker based on the generative language model. Every candidate
     * whose decoded text is non-empty and has a junk ratio of at most 0.10 is
     * matched to its most likely language and given a length-adjusted
     * z-score; the candidate with the highest z-score wins, as long as that
     * score is not below {@link #MIN_GENERATIVE_ZSCORE}.
     *
     * @param candidates decoded text per candidate key
     * @return the winning key, or {@code null} when the map is empty, all
     *         decodings are identical, or no candidate reaches the threshold
     */
    private static <K> K generativeTiebreak(Map<K, String> candidates) {
        if (candidates.isEmpty()) {
            return null;
        }

        // Identical decodings (e.g. a pure-ASCII probe) give the model nothing
        // to compare; returning null lets the caller fall back to its
        // DECLARATIVE-preference logic.
        if (allDecodingsIdentical(candidates)) {
            LOG.debug("generativeTiebreak: all decodings identical, deferring");
            return null;
        }

        K leader = null;
        float leaderZ = Float.NEGATIVE_INFINITY;

        for (Map.Entry<K, String> candidate : candidates.entrySet()) {
            String decoded = candidate.getValue();
            if (decoded == null || decoded.isEmpty()) {
                continue;
            }
            float junk = CharSoupLanguageDetector.junkRatio(decoded);
            if (junk > 0.10f) {
                continue;
            }
            Map.Entry<String, Float> language = GLM.bestMatch(decoded);
            if (language == null) {
                continue;
            }
            float z = GLM.zScoreLengthAdjusted(decoded, language.getKey());
            LOG.debug("generativeTiebreak: {} -> lang={} z={}",
                    candidate.getKey(), language.getKey(), z);
            if (Float.isNaN(z)) {
                continue;
            }
            if (z > leaderZ) {
                leaderZ = z;
                leader = candidate.getKey();
            }
        }

        if (leaderZ >= MIN_GENERATIVE_ZSCORE) {
            return leader;
        }
        LOG.debug("generativeTiebreak: inconclusive (bestZ={} < {})",
                leaderZ, MIN_GENERATIVE_ZSCORE);
        return null;
    }

    /**
     * Tells whether every value in {@code candidates} is the same text.
     *
     * <p>The comparison is null-safe and does not depend on iteration order:
     * a {@code null} value only equals another {@code null} value.</p>
     *
     * @param candidates decoded text per candidate key
     * @return {@code true} if the map is empty, has one entry, or all values
     *         are equal; {@code false} as soon as two values differ
     */
    private static <K> boolean allDecodingsIdentical(Map<K, String> candidates) {
        boolean seenFirst = false;
        String first = null;
        for (String text : candidates.values()) {
            if (!seenFirst) {
                // Track "seen" separately so a null first value is still compared
                // against the values that follow instead of being skipped.
                first = text;
                seenFirst = true;
            } else if (first == null ? text != null : !first.equals(text)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Removes a leading byte-order mark, if any.
     *
     * <p>BOM detection is delegated to {@link #bomCharsetName(byte[])}; when
     * it reports a BOM, the number of bytes to drop comes from
     * {@code bomLength}.</p>
     *
     * @param bytes raw bytes, possibly starting with a BOM
     * @return a copy of {@code bytes} without the BOM, or the same array
     *         instance when no BOM is found
     */
    private static byte[] stripBomBytes(byte[] bytes) {
        if (bomCharsetName(bytes) == null) {
            return bytes;
        }
        int skip = bomLength(bytes);
        return Arrays.copyOfRange(bytes, skip, bytes.length);
    }

    /**
     * Identifies a leading byte-order mark.
     *
     * <p>The four-byte UTF-32 signatures are tested before the two-byte
     * UTF-16 ones because the UTF-32LE BOM ({@code FF FE 00 00}) begins with
     * the UTF-16LE BOM ({@code FF FE}).</p>
     *
     * @param bytes raw bytes to inspect
     * @return {@code "UTF-32BE"}, {@code "UTF-32LE"}, {@code "UTF-8"},
     *         {@code "UTF-16BE"} or {@code "UTF-16LE"}, or {@code null} when
     *         the array does not start with one of those BOMs
     */
    static String bomCharsetName(byte[] bytes) {
        final int n = bytes.length;
        // Unsigned views of the first four bytes; -1 marks "not present".
        final int b0 = n > 0 ? bytes[0] & 0xFF : -1;
        final int b1 = n > 1 ? bytes[1] & 0xFF : -1;
        final int b2 = n > 2 ? bytes[2] & 0xFF : -1;
        final int b3 = n > 3 ? bytes[3] & 0xFF : -1;

        if (n >= 4) {
            if (b0 == 0x00 && b1 == 0x00 && b2 == 0xFE && b3 == 0xFF) {
                return "UTF-32BE";
            }
            if (b0 == 0xFF && b1 == 0xFE && b2 == 0x00 && b3 == 0x00) {
                return "UTF-32LE";
            }
        }
        if (n >= 3 && b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return "UTF-8";
        }
        if (n >= 2) {
            if (b0 == 0xFE && b1 == 0xFF) {
                return "UTF-16BE";
            }
            if (b0 == 0xFF && b1 == 0xFE) {
                return "UTF-16LE";
            }
        }
        return null;
    }

    /**
     * Returns the length in bytes of the byte-order mark at the start of
     * {@code bytes}.
     *
     * <p>Each BOM is matched against its complete signature, and UTF-32 is
     * tested before UTF-16 because the UTF-32LE BOM ({@code FF FE 00 00})
     * begins with the UTF-16LE BOM ({@code FF FE}). Matching whole signatures
     * keeps the result correct for a UTF-32BE BOM ({@code 00 00 FE FF}) and
     * for a UTF-16LE BOM followed by a {@code 0x00} byte.</p>
     *
     * @param bytes raw bytes to inspect
     * @return 4 for UTF-32, 3 for UTF-8, 2 for UTF-16, or 0 when the array
     *         does not start with a BOM
     */
    private static int bomLength(byte[] bytes) {
        if (hasBomPrefix(bytes, 0x00, 0x00, 0xFE, 0xFF)
                || hasBomPrefix(bytes, 0xFF, 0xFE, 0x00, 0x00)) {
            return 4; // UTF-32 BE / LE
        }
        if (hasBomPrefix(bytes, 0xEF, 0xBB, 0xBF)) {
            return 3; // UTF-8
        }
        if (hasBomPrefix(bytes, 0xFE, 0xFF) || hasBomPrefix(bytes, 0xFF, 0xFE)) {
            return 2; // UTF-16 BE / LE
        }
        return 0; // no BOM
    }

    /** Tells whether {@code bytes} starts with the given unsigned byte values. */
    private static boolean hasBomPrefix(byte[] bytes, int... signature) {
        if (bytes.length < signature.length) {
            return false;
        }
        for (int i = 0; i < signature.length; i++) {
            if ((bytes[i] & 0xFF) != signature[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Reads up to {@code readLimit} bytes from the stream and then resets it
     * to the position it had on entry.
     *
     * @param tis stream to read from; it is marked before reading and reset
     *            afterwards, even if reading fails
     * @return the bytes read, trimmed to the number actually read, or
     *         {@code null} if nothing could be read
     * @throws IOException if reading or resetting the stream fails
     */
    private byte[] readBytes(TikaInputStream tis) throws IOException {
        try {
            tis.mark(readLimit);
            byte[] buffer = new byte[readLimit];
            int filled = 0;
            // Keep reading until the buffer is full or the stream ends.
            while (filled < readLimit) {
                int n = tis.read(buffer, filled, readLimit - filled);
                if (n == -1) {
                    break;
                }
                filled += n;
            }
            if (filled == 0) {
                return null;
            }
            return filled < readLimit ? Arrays.copyOf(buffer, filled) : buffer;
        } finally {
            tis.reset();
        }
    }

    /**
     * Leniently decodes {@code bytes} with {@code charset}: malformed input
     * and unmappable characters are replaced instead of raising an error.
     *
     * @param bytes   raw bytes to decode
     * @param charset charset to decode with
     * @return the decoded text
     */
    static String decode(byte[] bytes, Charset charset) {
        CharsetDecoder lenient = charset.newDecoder();
        lenient.onMalformedInput(CodingErrorAction.REPLACE);
        lenient.onUnmappableCharacter(CodingErrorAction.REPLACE);

        ByteBuffer input = ByteBuffer.wrap(bytes);
        // Output buffer is sized at two chars per input byte.
        CharBuffer output = CharBuffer.allocate(bytes.length * 2);

        lenient.decode(input, output, true);
        lenient.flush(output);
        output.flip();
        return output.toString();
    }

    /**
     * Crude markup remover: drops everything from a {@code '<'} up to and
     * including the next {@code '>'}, so that HTML/XML tag names and
     * attributes do not pollute language scoring. A {@code '>'} outside a tag
     * is dropped as well, and an unclosed {@code '<'} swallows the rest of
     * the text.
     *
     * @param text text that may contain markup
     * @return the text with tag sequences removed
     */
    static String stripTags(String text) {
        StringBuilder kept = new StringBuilder(text.length());
        boolean insideTag = false;
        for (char ch : text.toCharArray()) {
            switch (ch) {
                case '<':
                    insideTag = true;
                    break;
                case '>':
                    insideTag = false;
                    break;
                default:
                    if (!insideTag) {
                        kept.append(ch);
                    }
                    break;
            }
        }
        return kept.toString();
    }

    /**
     * Returns the maximum number of bytes read from the stream when
     * candidate charsets have to be arbitrated.
     *
     * @return the current read limit in bytes
     */
    public int getReadLimit() {
        return this.readLimit;
    }

    /**
     * Sets the maximum number of bytes read from the stream when candidate
     * charsets have to be arbitrated.
     *
     * <p>The value is used as an array size when reading, so a negative
     * limit is rejected here instead of failing later with a
     * {@link NegativeArraySizeException}.</p>
     *
     * @param readLimit new read limit in bytes; must not be negative
     * @throws IllegalArgumentException if {@code readLimit} is negative
     */
    public void setReadLimit(int readLimit) {
        if (readLimit < 0) {
            throw new IllegalArgumentException(
                    "readLimit must not be negative: " + readLimit);
        }
        this.readLimit = readLimit;
    }
}