MagicDetectorTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect;

import static java.nio.charset.StandardCharsets.US_ASCII;
import static java.nio.charset.StandardCharsets.UTF_16BE;
import static java.nio.charset.StandardCharsets.UTF_16LE;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;

import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;

/**
 * Test cases for the {@link MagicDetector} class.
 */
public class MagicDetectorTest extends TikaTest {

    /**
     * A {@code null} input stream is expected to be reported as
     * {@code MediaType.OCTET_STREAM}.
     */
    @Test
    public void testDetectNull() throws Exception {
        MediaType htmlType = new MediaType("text", "html");
        byte[] magic = "<html".getBytes(US_ASCII);
        Detector htmlDetector = new MagicDetector(htmlType, magic);

        MediaType detected = htmlDetector.detect(null, new Metadata(), new ParseContext());
        assertEquals(MediaType.OCTET_STREAM, detected);
    }

    /**
     * Exercises a detector built from just a type and a byte pattern:
     * only input that begins with exactly {@code <html} is expected to match.
     */
    @Test
    public void testDetectSimple() throws Exception {
        MediaType htmlType = new MediaType("text", "html");
        Detector htmlDetector = new MagicDetector(htmlType, "<html".getBytes(US_ASCII));

        // Inputs that start with the exact pattern
        for (String matching : new String[]{"<html", "<html><head/><body/></html>"}) {
            assertDetect(htmlDetector, htmlType, matching);
        }

        // Wrong case, pattern not at the start, or no content at all
        for (String nonMatching : new String[]{"<HTML", "<?xml?><html", " <html", ""}) {
            assertDetect(htmlDetector, MediaType.OCTET_STREAM, nonMatching);
        }
    }

    /**
     * Exercises a detector constructed with the offset arguments
     * {@code 0, 64}: the pattern is expected to be found when it starts
     * anywhere in that range and missed when it starts beyond it.
     */
    @Test
    public void testDetectOffsetRange() throws Exception {
        MediaType htmlType = new MediaType("text", "html");
        byte[] magic = "<html".getBytes(US_ASCII);
        Detector rangeDetector = new MagicDetector(htmlType, magic, null, 0, 64);

        // Filler used to push the pattern towards the end of the range
        String filler = "0........1.........2.........3.........4.........5.........6";

        String[] earlyMatches = {
                "<html",
                "<html><head/><body/></html>",
                "<?xml?><html/>",
                "\n    <html",
                "\u0000<html"
        };
        for (String input : earlyMatches) {
            assertDetect(rangeDetector, htmlType, input);
        }

        // Truncated, incomplete or wrongly-cased patterns never match
        String[] nearMisses = {"<htm", " html", "<HTML"};
        for (String input : nearMisses) {
            assertDetect(rangeDetector, MediaType.OCTET_STREAM, input);
        }

        // One extra leading character is enough to move the pattern out of range
        assertDetect(rangeDetector, htmlType, filler + "1234<html");
        assertDetect(rangeDetector, MediaType.OCTET_STREAM, filler + "12345<html");

        assertDetect(rangeDetector, MediaType.OCTET_STREAM, "");
    }

    /**
     * Exercises a masked pattern. The mask byte {@code 0xdf} clears bit
     * {@code 0x20}, the bit that distinguishes upper-case from lower-case
     * ASCII letters, so the letters of {@code HTML} are expected to match
     * in either case.
     */
    @Test
    public void testDetectMask() throws Exception {
        MediaType htmlType = new MediaType("text", "html");
        byte ignoreCaseBit = (byte) 0xdf;
        byte[] pattern = {'<', 'H', 'T', 'M', 'L'};
        // The leading '<' is compared exactly; the letters are compared through the mask
        byte[] mask = {(byte) 0xff, ignoreCaseBit, ignoreCaseBit, ignoreCaseBit, ignoreCaseBit};
        Detector maskedDetector = new MagicDetector(htmlType, pattern, mask, 0, 64);

        // Filler used to push the pattern towards the end of the offset range
        String filler = "0        1         2         3         4         5         6";

        String[] matches = {
                "<html",
                "<HTML><head/><body/></html>",
                "<?xml?><HtMl/>",
                "\n    <html",
                "\u0000<HTML"
        };
        for (String input : matches) {
            assertDetect(maskedDetector, htmlType, input);
        }

        for (String input : new String[]{"<htm", " html"}) {
            assertDetect(maskedDetector, MediaType.OCTET_STREAM, input);
        }

        // One extra leading character is enough to move the pattern out of range
        assertDetect(maskedDetector, htmlType, filler + "1234<html");
        assertDetect(maskedDetector, MediaType.OCTET_STREAM, filler + "12345<html");

        assertDetect(maskedDetector, MediaType.OCTET_STREAM, "");
    }

    /**
     * Exercises a regular-expression pattern that accepts {@code %PDF-}
     * preceded by at most 144 arbitrary characters ({@code .{0,144}}).
     */
    @Test
    public void testDetectRegExPDF() throws Exception {
        MediaType pdfType = new MediaType("application", "pdf");
        byte[] regex = "(?s)\\A.{0,144}%PDF-".getBytes(US_ASCII);
        Detector pdfDetector = new MagicDetector(pdfType, regex, null, true, 0, 0);

        // Leading filler shared by the two boundary cases below
        String filler = "0        10        20        30        40        50        6" +
                "0        70        80        90        100       110       1" +
                "20       130       140";

        assertDetect(pdfDetector, pdfType, "%PDF-1.0");

        // A single additional leading character flips the expected result
        assertDetect(pdfDetector, pdfType, filler + "34%PDF-1.0");
        assertDetect(pdfDetector, MediaType.OCTET_STREAM, filler + "345%PDF-1.0");

        assertDetect(pdfDetector, MediaType.OCTET_STREAM, "");
    }

    /**
     * Exercises a regular expression containing greedy {@code .*} sections
     * against a small XHTML document.
     */
    @Test
    public void testDetectRegExGreedy() throws Exception {
        MediaType xhtmlType = new MediaType("application", "xhtml+xml");
        String regex = "(?s)\\x3chtml xmlns=\"http://www\\.w3\\.org/1999/xhtml" +
                "\".*\\x3ctitle\\x3e.*\\x3c/title\\x3e";
        byte[] regexBytes = regex.getBytes(US_ASCII);
        Detector xhtmlDetector = new MagicDetector(xhtmlType, regexBytes, null, true, 0, 8192);

        String document = "<html xmlns=\"http://www.w3.org/1999/xhtml\">" +
                "<head><title>XHTML test document</title></head>";
        assertDetect(xhtmlDetector, xhtmlType, document);
    }

    /**
     * Exercises a regular expression built from alternations such as
     * {@code (?:DOCTYPE|doctype)}: each keyword may be entirely upper-case
     * or entirely lower-case, but a keyword with mixed case is expected
     * not to match.
     */
    @Test
    public void testDetectRegExOptions() throws Exception {
        String regex = "(?s)\\A.{0,1024}\\x3c\\!(?:DOCTYPE|doctype) (?:HTML|html) " +
                "(?:PUBLIC|public) \"-//.{1,16}//(?:DTD|dtd) .{0,64}" + "(?:HTML|html) 4\\.01";

        // Everything after the DOCTYPE public identifier is the same in all three documents
        String remainder = "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>" +
                "<HEAD><TITLE>HTML document</TITLE></HEAD>" + "<BODY><P>Hello world!</BODY></HTML>";

        String upperCaseDoc = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\"" + remainder;
        String perKeywordCaseDoc =
                "<!DOCTYPE html PUBLIC \"-//W3C//dtd html 4.01//EN\"" + remainder;
        String mixedCaseDoc = "<!DoCtYpE hTmL pUbLiC \"-//W3C//dTd HtMl 4.01//EN\"" + remainder;

        MediaType htmlType = new MediaType("text", "html");
        Detector htmlDetector =
                new MagicDetector(htmlType, regex.getBytes(US_ASCII), null, true, 0, 0);

        assertDetect(htmlDetector, htmlType, upperCaseDoc);
        assertDetect(htmlDetector, htmlType, perKeywordCaseDoc);
        assertDetect(htmlDetector, MediaType.OCTET_STREAM, mixedCaseDoc);
    }

    /**
     * Checks that detection still succeeds when the underlying stream
     * hands back fewer bytes than requested on larger reads.
     */
    @Test
    public void testDetectStreamReadProblems() throws Exception {
        MediaType expectedType = new MediaType("application", "test");
        byte[] content = "abcdefghijklmnopqrstuvwxyz0123456789".getBytes(US_ASCII);
        // The pattern is the complete content
        Detector detector = new MagicDetector(expectedType, content, null, false, 0, 0);

        // RestrictiveInputStream deliberately keeps a single read(...) call
        // from filling the whole buffer
        InputStream shortReads = new RestrictiveInputStream(content);
        try (TikaInputStream tis = TikaInputStream.get(shortReads)) {
            MediaType detected = detector.detect(tis, new Metadata(), new ParseContext());
            assertEquals(expectedType, detected);
        }
    }

    /**
     * Uses the full content of an ENVI header test document as the magic
     * pattern and checks that it is still detected when the stream hands
     * back fewer bytes than requested on larger reads.
     */
    @Test
    public void testDetectApplicationEnviHdr() throws Exception {
        byte[] data;
        // Close the resource stream once its bytes have been copied
        try (InputStream iStream =
                getResourceAsStream("/test-documents/ang20150420t182050_corr_v1e_img.hdr")) {
            data = IOUtils.toByteArray(iStream);
        }
        MediaType testMT = new MediaType("application", "envi.hdr");
        Detector detector = new MagicDetector(testMT, data, null, false, 0, 0);
        // Deliberately prevent InputStream.read(...) from reading the entire
        // buffer in one go
        try (TikaInputStream tis = TikaInputStream.get(new RestrictiveInputStream(data))) {
            assertEquals(testMT, detector.detect(tis, new Metadata(), new ParseContext()));
        }
    }

    /**
     * Exercises detectors created through {@code MagicDetector.parse} for
     * the {@code string}, {@code unicodeLE}, {@code unicodeBE} and
     * {@code stringignorecase} types, all with the offset range
     * {@code 0:20}.
     */
    @Test
    public void testDetectString() throws Exception {
        MediaType expectedType = new MediaType("application", "test");
        String text = "abcdEFGhijklmnoPQRstuvwxyz0123456789";
        byte[] asciiBytes = text.getBytes(US_ASCII);

        // Plain string matching, at the start and a couple of bytes in
        assertDetect(MagicDetector.parse(expectedType, "string", "0:20", "abcd", null),
                expectedType, asciiBytes);
        assertDetect(MagicDetector.parse(expectedType, "string", "0:20", "cdEFGh", null),
                expectedType, asciiBytes);

        // UTF-16 content, little endian and then big endian
        assertDetect(MagicDetector.parse(expectedType, "unicodeLE", "0:20", "cdEFGh", null),
                expectedType, text.getBytes(UTF_16LE));
        assertDetect(MagicDetector.parse(expectedType, "unicodeBE", "0:20", "cdEFGh", null),
                expectedType, text.getBytes(UTF_16BE));

        // Matching that ignores letter case
        assertDetect(
                MagicDetector.parse(expectedType, "stringignorecase", "0:20", "BcDeFgHiJKlm", null),
                expectedType, asciiBytes);
    }

    /**
     * Convenience overload: encodes {@code data} as US-ASCII and delegates
     * to the {@code byte[]} variant.
     *
     * @param detector detector under test
     * @param type     media type the detector is expected to report
     * @param data     stream content, as text
     */
    private void assertDetect(Detector detector, MediaType type, String data) {
        assertDetect(detector, type, data.getBytes(US_ASCII));
    }

    /**
     * Runs {@code detector} over {@code bytes} and asserts that it reports
     * {@code type}, then verifies that the stream can still be read from
     * the beginning afterwards, byte for byte, up to end of stream.
     * <p>
     * The stream is closed when the check completes. An {@link IOException}
     * fails the test and is attached to the failure as its cause.
     *
     * @param detector detector under test
     * @param type     media type the detector is expected to report
     * @param bytes    stream content
     */
    private void assertDetect(Detector detector, MediaType type, byte[] bytes) {
        try (TikaInputStream tis = TikaInputStream.get(bytes)) {
            assertEquals(type, detector.detect(tis, new Metadata(), new ParseContext()));

            // Test that the stream has been reset
            for (byte aByte : bytes) {
                assertEquals(aByte, (byte) tis.read());
            }
            assertEquals(-1, tis.read());
        } catch (IOException e) {
            // Keep the original exception so the failure report shows its stack trace
            fail("Unexpected exception from MagicDetector", e);
        }
    }

    /**
     * InputStream class that does not read in all available bytes in
     * one go.
     */
    private static class RestrictiveInputStream extends ByteArrayInputStream {
        public RestrictiveInputStream(byte[] buf) {
            super(buf);
        }

        /**
         * Prevent reading the entire len of bytes if requesting more
         * than 10 bytes: such a request is passed on to the superclass
         * with {@code len} reduced by 10. Requests for 10 bytes or fewer
         * are passed on unchanged.
         *
         * @param b   destination buffer
         * @param off offset in {@code b} at which to start storing bytes
         * @param len maximum number of bytes the caller asked for
         * @return whatever the superclass read returns for the (possibly
         *         reduced) request
         */
        @Override
        public int read(byte[] b, int off, int len) {
            if (len > 10) {
                return super.read(b, off, len - 10);
            } else {
                return super.read(b, off, len);
            }
        }
    }

    /**
     * Runs a {@code DefaultDetector} over each bzip2 test document and
     * expects {@code application/x-bzip2} for every one of them.
     */
    @Test
    public void testBZ2Detection() throws Exception {
        String[] bz2Files = {
                "bzip2-8-file.txt.bz2",
                "empty-file.txt.bz2",
                "lbzip2-8-file.txt.bz2",
                "small-file.txt.bz2",
                "test-file-1.csv.bz2",
                "test-file-2.csv.bz2"
        };
        Detector defaultDetector = new DefaultDetector();
        for (String fileName : bz2Files) {
            String detectedType = detect(defaultDetector, fileName);
            assertEquals("application/x-bzip2", detectedType);
        }
    }

    /**
     * Opens the named resource under {@code /test-documents/bz2/}, runs the
     * detector over it and returns the detected media type as a string.
     *
     * @param detector detector to run
     * @param bz2Name  file name of the test document
     * @return string form of the detected media type
     * @throws IOException if the detector or the stream reports an I/O error
     */
    private String detect(Detector detector, String bz2Name) throws IOException {
        String resourcePath = "/test-documents/bz2/" + bz2Name;
        try (TikaInputStream stream = getResourceAsStream(resourcePath)) {
            MediaType detected = detector.detect(stream, new Metadata(), new ParseContext());
            return detected.toString();
        }
    }
}