CharSoupEncodingDetectorTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup;

import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.ByteArrayInputStream;
import java.nio.charset.Charset;
import java.util.List;

import org.junit.jupiter.api.Test;

import org.apache.tika.detect.EncodingDetectorContext;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.detect.MetaEncodingDetector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;

public class CharSoupEncodingDetectorTest {

    @Test
    public void testIsMetaEncodingDetector() {
        assertTrue(new CharSoupEncodingDetector() instanceof MetaEncodingDetector);
    }

    @Test
    public void testUnanimous() throws Exception {
        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
        EncodingDetectorContext context = new EncodingDetectorContext();
        context.addResult(List.of(new EncodingResult(UTF_8, 0.9f)), "DetectorA");
        context.addResult(List.of(new EncodingResult(UTF_8, 0.8f)), "DetectorB");

        ParseContext parseContext = new ParseContext();
        parseContext.set(EncodingDetectorContext.class, context);

        byte[] data = "Hello, world!".getBytes(UTF_8);
        try (TikaInputStream tis = TikaInputStream.get(new ByteArrayInputStream(data))) {
            List<EncodingResult> result = detector.detect(tis, new Metadata(), parseContext);
            assertEquals(UTF_8, result.get(0).getCharset());
            assertEquals("unanimous", context.getArbitrationInfo());
        }
    }

    @Test
    public void testNoContext() throws Exception {
        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
        ParseContext parseContext = new ParseContext();

        byte[] data = "Test".getBytes(UTF_8);
        try (TikaInputStream tis = TikaInputStream.get(new ByteArrayInputStream(data))) {
            List<EncodingResult> result = detector.detect(tis, new Metadata(), parseContext);
            assertTrue(result.isEmpty());
        }
    }

    @Test
    public void testEmptyResults() throws Exception {
        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
        EncodingDetectorContext context = new EncodingDetectorContext();

        ParseContext parseContext = new ParseContext();
        parseContext.set(EncodingDetectorContext.class, context);

        byte[] data = "Test".getBytes(UTF_8);
        try (TikaInputStream tis = TikaInputStream.get(new ByteArrayInputStream(data))) {
            List<EncodingResult> result = detector.detect(tis, new Metadata(), parseContext);
            assertTrue(result.isEmpty());
        }
    }

    @Test
    public void testArabicEncodingArbitration() throws Exception {
        // Arabic text encoded in windows-1256.
        // When decoded as UTF-8 it produces replacement chars / garbage.
        // When decoded as windows-1256 it produces valid Arabic.
        // The language detector should pick windows-1256.
        Charset windows1256 = Charset.forName("windows-1256");

        String arabicText =
                "\u0641\u064a \u0642\u0631\u064a\u0629 \u0645\u0646 " +
                "\u0627\u0644\u0642\u0631\u0649 \u0643\u0627\u0646 " +
                "\u0647\u0646\u0627\u0643 \u0631\u062c\u0644 " +
                "\u062d\u0643\u064a\u0645 \u064a\u0639\u0631\u0641 " +
                "\u0643\u0644 \u0634\u064a\u0621 \u0639\u0646 " +
                "\u0627\u0644\u062d\u064a\u0627\u0629 \u0648\u0643\u0627\u0646 " +
                "\u064a\u0639\u0644\u0645 \u0627\u0644\u0646\u0627\u0633 " +
                "\u0643\u064a\u0641 \u064a\u0639\u064a\u0634\u0648\u0646 " +
                "\u0628\u0633\u0644\u0627\u0645 \u0648\u0627\u0646\u0633\u062c\u0627\u0645. " +
                "\u0627\u0644\u0644\u063a\u0629 \u0627\u0644\u0639\u0631\u0628\u064a\u0629 " +
                "\u0647\u064a \u0648\u0627\u062d\u062f\u0629 \u0645\u0646 " +
                "\u0623\u0643\u062b\u0631 \u0627\u0644\u0644\u063a\u0627\u062a " +
                "\u0627\u0646\u062a\u0634\u0627\u0631\u0627 \u0641\u064a " +
                "\u0627\u0644\u0639\u0627\u0644\u0645 \u0648\u064a\u062a\u062d\u062b\u0647\u0627 " +
                "\u0623\u0643\u062b\u0631 \u0645\u0646 \u062b\u0644\u0627\u062b\u0645\u0627\u0626\u0629 " +
                "\u0645\u0644\u064a\u0648\u0646 \u0625\u0646\u0633\u0627\u0646.";
        byte[] arabicBytes = arabicText.getBytes(windows1256);

        EncodingDetectorContext context = new EncodingDetectorContext();
        context.addResult(List.of(new EncodingResult(UTF_8, 0.5f)), "HtmlEncodingDetector");
        context.addResult(List.of(new EncodingResult(windows1256, 0.8f)), "Icu4jEncodingDetector");

        ParseContext parseContext = new ParseContext();
        parseContext.set(EncodingDetectorContext.class, context);

        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
        try (TikaInputStream tis = TikaInputStream.get(new ByteArrayInputStream(arabicBytes))) {
            List<EncodingResult> result = detector.detect(tis, new Metadata(), parseContext);
            assertEquals(windows1256, result.get(0).getCharset());
            assertEquals("scored", context.getArbitrationInfo());
        }
    }

    @Test
    public void testStreamResetAfterDetection() throws Exception {
        EncodingDetectorContext context = new EncodingDetectorContext();
        context.addResult(List.of(new EncodingResult(UTF_8, 0.9f)), "DetectorA");
        context.addResult(List.of(new EncodingResult(ISO_8859_1, 0.5f)), "DetectorB");

        ParseContext parseContext = new ParseContext();
        parseContext.set(EncodingDetectorContext.class, context);

        byte[] data = "Hello, world! This is a test of encoding detection.".getBytes(UTF_8);
        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
        try (TikaInputStream tis = TikaInputStream.get(new ByteArrayInputStream(data))) {
            detector.detect(tis, new Metadata(), parseContext);

            // Verify stream is back at the start
            byte[] readBack = new byte[data.length];
            int bytesRead = tis.read(readBack);
            assertEquals(data.length, bytesRead);
            assertEquals("Hello, world! This is a test of encoding detection.",
                    new String(readBack, UTF_8));
        }
    }

    @Test
    public void testStripTags() {
        assertEquals("Hello world",
                CharSoupEncodingDetector.stripTags(
                        "<html><body>Hello world</body></html>"));
        assertEquals("no tags here",
                CharSoupEncodingDetector.stripTags("no tags here"));
        assertEquals("",
                CharSoupEncodingDetector.stripTags("<empty/>"));
    }

    @Test
    public void testDecode() {
        byte[] utf8Bytes = "caf\u00e9".getBytes(UTF_8);
        assertEquals("caf\u00e9",
                CharSoupEncodingDetector.decode(utf8Bytes, UTF_8));
    }

    @Test
    public void testReadLimitGetterSetter() {
        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
        assertEquals(16384, detector.getReadLimit());
        detector.setReadLimit(4096);
        assertEquals(4096, detector.getReadLimit());
    }

    @Test
    public void testJunkRatio() {
        // Clean text ��� no junk
        assertEquals(0f,
                CharSoupLanguageDetector.junkRatio("Hello, world!"), 0.001f);

        // U+FFFD replacement chars
        assertEquals(0.5f,
                CharSoupLanguageDetector.junkRatio("ab\uFFFD\uFFFD"), 0.001f);

        // C1 control chars (U+0080-U+009F are isISOControl)
        assertEquals(0.25f,
                CharSoupLanguageDetector.junkRatio("abc\u0080"), 0.001f);

        // \r and \n are ordinary whitespace ��� not junk
        assertEquals(0f,
                CharSoupLanguageDetector.junkRatio("hello world\r\n"), 0.001f);

        // Non-whitespace C1 control char mixed with ordinary whitespace
        assertEquals(1f / 14f,
                CharSoupLanguageDetector.junkRatio("hello world\r\n\u0080"), 0.001f);

        // Empty/null
        assertEquals(0f, CharSoupLanguageDetector.junkRatio(""), 0.001f);
        assertEquals(0f, CharSoupLanguageDetector.junkRatio(null), 0.001f);
    }
}