// TextDetectorTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;

import java.io.IOException;
import java.util.Arrays;

import org.junit.jupiter.api.Test;

import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;

/**
 * Test cases for the {@link TextDetector} class.
 *
 * <p>Each test feeds a small in-memory byte array to the detector and checks
 * whether it is reported as {@link MediaType#TEXT_PLAIN} or
 * {@link MediaType#OCTET_STREAM}.
 */
public class TextDetectorTest {

    private final Detector detector = new TextDetector();

    /**
     * A {@code null} input stream is expected to yield {@link MediaType#OCTET_STREAM}.
     */
    @Test
    public void testDetectNull() throws Exception {
        assertEquals(MediaType.OCTET_STREAM,
                detector.detect(null, new Metadata(), new ParseContext()));
    }

    /**
     * Test for type detection of empty documents.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
     */
    @Test
    public void testDetectEmpty() throws Exception {
        assertNotText(new byte[0]);
    }

    /**
     * Exercises the detector with plain text, binary data, and buffers that sit
     * right at the edge between the two.
     */
    @Test
    public void testDetectText() throws Exception {
        assertText("Hello, World!".getBytes(UTF_8));
        assertText(" \t\r\n".getBytes(UTF_8));
        assertNotText(new byte[]{-1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B});
        assertNotText(new byte[]{0});
        assertNotText(new byte[]{'H', 'e', 'l', 'l', 'o', 0});

        // 512 bytes of '.' with a growing run of 0x1f control bytes.
        byte[] data = new byte[512];
        Arrays.fill(data, (byte) '.');
        assertText(data);
        // Indices 100..109: ten control bytes are still expected to pass as text.
        Arrays.fill(data, 100, 110, (byte) 0x1f);
        assertText(data); // almost text
        // Indices 100..110: the eleventh control byte is expected to tip the result.
        Arrays.fill(data, 100, 111, (byte) 0x1f);
        assertNotText(data); // no longer almost text, too many control chars
        Arrays.fill(data, (byte) 0x1f);
        assertNotText(data);

        // 513 bytes, i.e. one more than the buffer above.
        // NOTE(review): presumably one byte past the detector's look-ahead
        // window -- confirm against TextDetector.
        data = new byte[513];
        Arrays.fill(data, (byte) '.');
        data[0] = 0x1f;
        assertText(data);
        // Indices 100..149: fifty 0x83 bytes are still expected to pass as text.
        Arrays.fill(data, 100, 150, (byte) 0x83);
        assertText(data); // almost text
        // Indices 100..199: one hundred 0x83 bytes are expected to be rejected.
        Arrays.fill(data, 100, 200, (byte) 0x83);
        assertNotText(data); // no longer almost text, too many non-ASCII
        Arrays.fill(data, (byte) 0x1f);
        assertNotText(data);
    }

    /**
     * Asserts that the detector reports {@code data} as {@link MediaType#TEXT_PLAIN}
     * and that the stream can afterwards still be read from its first byte to
     * its end.
     *
     * @param data the bytes to run through the detector
     */
    private void assertText(byte[] data) {
        // try-with-resources so the stream is closed even when an assertion fails
        try (TikaInputStream tis = TikaInputStream.get(data)) {
            assertEquals(MediaType.TEXT_PLAIN,
                    detector.detect(tis, new Metadata(), new ParseContext()));

            // Test that the stream has been reset
            for (byte aByte : data) {
                assertEquals(aByte, (byte) tis.read());
            }
            assertEquals(-1, tis.read());
        } catch (IOException e) {
            // Pass the exception along as the cause so its stack trace is reported
            fail("Unexpected exception from TextDetector", e);
        }
    }

    /**
     * Asserts that the detector reports {@code data} as {@link MediaType#OCTET_STREAM}.
     *
     * @param data the bytes to run through the detector
     */
    private void assertNotText(byte[] data) {
        try (TikaInputStream tis = TikaInputStream.get(data)) {
            assertEquals(MediaType.OCTET_STREAM,
                    detector.detect(tis, new Metadata(), new ParseContext()));
        } catch (IOException e) {
            // Pass the exception along as the cause so its stack trace is reported
            fail("Unexpected exception from TextDetector", e);
        }
    }

}