// ProbabilisticMimeDetectionTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.mime;

import static java.nio.charset.StandardCharsets.UTF_16BE;
import static java.nio.charset.StandardCharsets.UTF_16LE;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;

/**
 * Tests for {@link ProbabilisticMimeDetectionSelector}: detection from stream
 * content, from a resource-name hint and from a content-type hint, plus the
 * specialization relations reported by the selector's {@link MediaTypeRegistry}.
 */
public class ProbabilisticMimeDetectionTest {

    private ProbabilisticMimeDetectionSelector proDetector;

    private MediaTypeRegistry registry;

    /**
     * Builds a fresh selector with the no-argument constructor before every
     * test and caches the registry it exposes.
     */
    @BeforeEach
    public void setUp() {
        proDetector = new ProbabilisticMimeDetectionSelector();
        this.registry = proDetector.getMediaTypeRegistry();
    }

    /**
     * Runs detection against a set of classpath test documents and checks the
     * detected type, both with and without a resource-name hint (see
     * {@link #testStream(String, String, InputStream)}).
     */
    @Test
    public void testDetection() throws Exception {
        testFile("image/svg+xml", "circles.svg");
        testFile("image/svg+xml", "circles-with-prefix.svg");
        testFile("image/png", "datamatrix.png");
        testFile("text/html", "test.html");
        testFile("application/xml", "test-iso-8859-1.xml");
        testFile("application/xml", "test-utf8.xml");
        testFile("application/xml", "test-utf8-bom.xml");
        testFile("application/xml", "test-utf16le.xml");
        testFile("application/xml", "test-utf16be.xml");
        testFile("application/xml", "test-long-comment.xml");
        testFile("application/xslt+xml", "stylesheet.xsl");
        // The URL is only used as the resource name; the content comes from the local file.
        testUrl("application/rdf+xml", "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
                "test-difficult-rdf1.xml");
        testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#", "test-difficult-rdf2.xml");
        // add evil test from TIKA-327
        testFile("text/html", "test-tika-327.html");
        // add another evil html test from TIKA-357
        testFile("text/html", "testlargerbuffer.html");
        // test fragment of HTML with <div> (TIKA-1102)
        testFile("text/html", "htmlfragment");
        // test binary CGM detection (TIKA-1170)
        testFile("image/cgm", "plotutils-bin-cgm-v3.cgm");
        // test HTML detection of malformed file, previously identified as
        // image/cgm (TIKA-1170)
        testFile("text/html", "test-malformed-header.html.bin");
    }

    /**
     * Text that starts with a byte order mark must be detected as
     * {@code text/plain} when encoded as UTF-16LE, UTF-16BE or UTF-8.
     */
    @Test
    public void testByteOrderMark() throws Exception {
        try (TikaInputStream tis = TikaInputStream.get("\ufefftest".getBytes(UTF_16LE))) {
            assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(tis, new Metadata(), new ParseContext()));
        }
        try (TikaInputStream tis = TikaInputStream.get("\ufefftest".getBytes(UTF_16BE))) {
            assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(tis, new Metadata(), new ParseContext()));
        }

        try (TikaInputStream tis = TikaInputStream.get("\ufefftest".getBytes(UTF_8))) {
            assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(tis, new Metadata(), new ParseContext()));
        }
    }

    /**
     * Checks the specialization relations the registry is expected to report:
     * parameterized types against their base type, {@code text/*} against
     * {@code text/plain}, {@code +xml} / {@code +zip} suffixes against their
     * structured-syntax base types, and a few explicit relations.
     */
    @Test
    public void testSuperTypes() {
        assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"),
                MediaType.parse("text/something")));

        assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"),
                MediaType.TEXT_PLAIN));

        assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"),
                MediaType.OCTET_STREAM));

        assertTrue(registry.isSpecializationOf(MediaType.parse("text/something"),
                MediaType.TEXT_PLAIN));

        assertTrue(registry.isSpecializationOf(MediaType.parse("application/something+xml"),
                MediaType.APPLICATION_XML));

        assertTrue(registry.isSpecializationOf(MediaType.parse("application/something+zip"),
                MediaType.APPLICATION_ZIP));

        assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML, MediaType.TEXT_PLAIN));

        assertTrue(registry.isSpecializationOf(MediaType.parse("application/vnd.apple.iwork"),
                MediaType.APPLICATION_ZIP));
    }

    /**
     * Fetches {@code url} over the network and runs the detection checks on
     * the response body. Not called by any test in this class.
     *
     * @param expected expected media type, in string form
     * @param url      URL to open; also used as the resource name
     * @throws IOException if the URL cannot be opened or read
     */
    @SuppressWarnings("unused")
    private void testUrlOnly(String expected, String url) throws IOException {
        // Same pattern as testUrl/testFile: the opened stream is closed here
        // no matter how testStream exits.
        try (InputStream in = new URL(url).openStream()) {
            testStream(expected, url, in);
        }
    }

    /**
     * Runs the detection checks on a classpath resource while presenting
     * {@code url} as its resource name.
     *
     * @param expected expected media type, in string form
     * @param url      value used as the resource name (never fetched)
     * @param file     classpath resource, resolved relative to this class
     * @throws IOException if the resource cannot be read
     */
    private void testUrl(String expected, String url, String file) throws IOException {
        try (InputStream in = getClass().getResourceAsStream(file)) {
            testStream(expected, url, in);
        }
    }

    /**
     * Runs the detection checks on a classpath resource, using its file name
     * as the resource name.
     *
     * @param expected expected media type, in string form
     * @param filename classpath resource, resolved relative to this class
     * @throws IOException if the resource cannot be read
     */
    private void testFile(String expected, String filename) throws IOException {
        try (InputStream in = getClass().getResourceAsStream(filename)) {
            testStream(expected, filename, in);
        }
    }

    /**
     * Asserts that {@code in} is detected as {@code expected}, first with
     * empty metadata and then again after the resource name has been set.
     *
     * @param expected      expected media type, in string form
     * @param urlOrFileName resource name hint; also used in failure messages
     * @param in            stream to detect; the test fails if it is {@code null}
     * @throws IOException if reading the stream fails
     */
    private void testStream(String expected, String urlOrFileName, InputStream in)
            throws IOException {
        assertNotNull(in, "Test stream: [" + urlOrFileName + "] is null!");
        try (TikaInputStream tis = TikaInputStream.get(in)) {
            Metadata metadata = new Metadata();
            String mime = this.proDetector.detect(tis, metadata, new ParseContext()).toString();
            assertEquals(expected, mime,
                    urlOrFileName + " is not properly detected: detected.");

            // Add resource name and test again
            // NOTE(review): the same stream is reused, so this presumably relies on
            // detect() leaving the stream position where it found it - confirm.
            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, urlOrFileName);
            mime = this.proDetector.detect(tis, metadata, new ParseContext()).toString();
            assertEquals(expected, mime,
                    urlOrFileName + " is not properly detected after adding resource name.");
        }
    }

    /**
     * Test for type detection of empty documents.
     * <p>
     * With no hints an empty stream is {@code application/octet-stream};
     * a {@code .txt} resource name or a {@code text/plain} content-type hint
     * each yield {@code text/plain}.
     *
     * @see <a
     * href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
     */
    @Test
    public void testEmptyDocument() throws IOException {
        try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
            assertEquals(MediaType.OCTET_STREAM,
                    proDetector.detect(tis, new Metadata(), new ParseContext()));
        }

        Metadata namehint = new Metadata();
        namehint.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt");
        try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
            assertEquals(MediaType.TEXT_PLAIN,
                    proDetector.detect(tis, namehint, new ParseContext()));
        }

        Metadata typehint = new Metadata();
        typehint.set(Metadata.CONTENT_TYPE, "text/plain");
        try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
            assertEquals(MediaType.TEXT_PLAIN,
                    proDetector.detect(tis, typehint, new ParseContext()));
        }

    }

    /**
     * Test for things like javascript files whose content is enclosed in XML
     * comment delimiters, but that aren't actually XML.
     *
     * @see <a
     * href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
     */
    @Test
    public void testNotXML() throws IOException {
        try (TikaInputStream tis = TikaInputStream.get("<!-- test -->".getBytes(UTF_8))) {
            assertEquals(MediaType.TEXT_PLAIN, proDetector.detect(tis, new Metadata(), new ParseContext()));
        }
    }

    /**
     * Tests that when we repeatedly test the detection of a document that can
     * be detected with Mime Magic, that we consistently detect it correctly.
     * See TIKA-391 for more details.
     */
    @Test
    public void testMimeMagicStability() throws IOException {
        for (int i = 0; i < 100; i++) {
            testFile("application/vnd.ms-excel", "test.xls");
        }
    }

    /**
     * Tests that when two magic matches both apply, and both have the same
     * priority, we use the name to pick the right one based on the glob, or the
     * first one we come across if not. See TIKA-1292 for more details.
     * <p>
     * NOTE(review): the {@code hello/*} types are presumably declared in a
     * test-only custom mime types definition - confirm against the test
     * resources.
     */
    @Test
    public void testMimeMagicClashSamePriority() throws IOException {
        byte[] helloWorld = "Hello, World!".getBytes(UTF_8);
        MediaType helloType = MediaType.parse("hello/world-file");
        MediaType helloXType = MediaType.parse("hello/x-world-hello");
        Metadata metadata;

        // With a filename, picks the right one
        metadata = new Metadata();
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.hello.world");
        try (TikaInputStream tis = TikaInputStream.get(helloWorld)) {
            assertEquals(helloType, proDetector.detect(tis, metadata, new ParseContext()));
        }

        metadata = new Metadata();
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.x-hello-world");
        try (TikaInputStream tis = TikaInputStream.get(helloWorld)) {
            assertEquals(helloXType, proDetector.detect(tis, metadata, new ParseContext()));
        }

        // Without, goes for the one that sorts last
        metadata = new Metadata();
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "testingTESTINGtesting");
        try (TikaInputStream tis = TikaInputStream.get(helloWorld)) {
            assertEquals(helloXType, proDetector.detect(tis, metadata, new ParseContext()));
        }
    }

    /**
     * A {@code text/javascript} content-type hint must be honoured for a small
     * JavaScript snippet (TIKA-2237).
     */
    @Test
    public void testTIKA2237() throws IOException {
        Metadata metadata = new Metadata();
        metadata.add(Metadata.CONTENT_TYPE, MediaType.text("javascript").toString());
        byte[] script =
                ("function() {};\n" + "try {\n" + "    window.location = 'index.html';\n" +
                        "} catch (e) {\n" + "    console.log(e);\n" + "}")
                        .getBytes(StandardCharsets.UTF_8);
        // Close the stream like every other test here; it was previously left open.
        try (TikaInputStream tis = TikaInputStream.get(script)) {
            MediaType detect = new ProbabilisticMimeDetectionSelector().detect(tis, metadata, new ParseContext());
            assertEquals(MediaType.text("javascript"), detect);
        }
    }
}