NameDetectorTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;

/**
 * Test cases for the {@link NameDetector} class.
 *
 * <p>Every test runs against a detector configured in {@link #setUp()} with a
 * small, fixed set of name patterns, and checks the media type reported for a
 * given resource name.
 */
public class NameDetectorTest {

    /** Failure message used when the detector unexpectedly throws an {@link IOException}. */
    private static final String UNEXPECTED_IO_EXCEPTION =
            "NameDetector should never throw an IOException";

    /** Detector under test; rebuilt before each test by {@link #setUp()}. */
    private Detector detector;

    /**
     * Builds the detector under test from three name patterns:
     * a case-insensitive {@code .txt} suffix and the exact name {@code README}
     * (both mapped to {@code text/plain}), and a {@code .hdr} suffix mapped to
     * {@code application/envi.hdr}.
     */
    @BeforeEach
    public void setUp() {
        Map<Pattern, MediaType> patterns = new HashMap<>();
        patterns.put(Pattern.compile(".*\\.txt", Pattern.CASE_INSENSITIVE), MediaType.TEXT_PLAIN);
        patterns.put(Pattern.compile("README"), MediaType.TEXT_PLAIN);
        patterns.put(Pattern.compile(".*\\.hdr"), MediaType.application("envi.hdr"));
        detector = new NameDetector(patterns);
    }

    /**
     * Checks detection for plain names, names wrapped in whitespace, paths,
     * URLs (query, fragment, percent-encoding), names containing {@code #},
     * and the empty / missing name cases.
     */
    @Test
    public void testDetect() {
        assertDetect(MediaType.TEXT_PLAIN, "text.txt");
        assertDetect(MediaType.TEXT_PLAIN, "text.txt ");    // trailing space
        assertDetect(MediaType.TEXT_PLAIN, "text.txt\n");   // trailing newline
        assertDetect(MediaType.TEXT_PLAIN, "text.txt?a=b"); // URL query
        assertDetect(MediaType.TEXT_PLAIN, "text.txt#abc"); // URL fragment
        assertDetect(MediaType.TEXT_PLAIN, "text%2Etxt");   // URL encoded
        assertDetect(MediaType.TEXT_PLAIN, "text.TXT");     // case insensitive
        assertDetect(MediaType.OCTET_STREAM, "text.txt.gz");

        assertDetect(MediaType.TEXT_PLAIN, "README");
        assertDetect(MediaType.TEXT_PLAIN, " README ");     // space around
        assertDetect(MediaType.TEXT_PLAIN, "\tREADME\n");   // other whitespace
        assertDetect(MediaType.TEXT_PLAIN, "/a/README");    // leading path
        assertDetect(MediaType.TEXT_PLAIN, "\\b\\README");  // windows path
        assertDetect(MediaType.OCTET_STREAM, "ReadMe");     // case sensitive
        assertDetect(MediaType.OCTET_STREAM, "README.NOW");

        // TIKA-1928 # in the filename
        assertDetect(MediaType.TEXT_PLAIN, "text.txt");
        assertDetect(MediaType.TEXT_PLAIN, "text#.txt");   // # before extension
        assertDetect(MediaType.TEXT_PLAIN, "text#123.txt");// # before extension
        assertDetect(MediaType.TEXT_PLAIN, "text.txt#pdf");// # after extension

        // TIKA-3783 # before the final .
        assertDetect(MediaType.TEXT_PLAIN, "ABC#192.168.0.1#2.txt");

        // Check # as URL fragment too
        assertDetect(MediaType.TEXT_PLAIN, "http://foo/test.txt?1=2#pdf");
        assertDetect(MediaType.TEXT_PLAIN, "http://foo/test.txt#pdf");

        // tough one
        assertDetect(MediaType.TEXT_PLAIN, " See http://www.example.com:1234/README.txt?a=b#c \n");
        assertDetect(MediaType.TEXT_PLAIN, "See README.txt"); // even this!
        assertDetect(MediaType.OCTET_STREAM, "See README");   // but not this

        assertDetect(MediaType.application("envi.hdr"), "ang20150420t182050_corr_v1e_img.hdr");

        // test also the zero input cases
        assertDetect(MediaType.OCTET_STREAM, "");
        assertDetect(MediaType.OCTET_STREAM, null);
        // metadata with no resource name set at all
        assertEquals(MediaType.OCTET_STREAM, detectOrFail(new Metadata()));
    }

    /**
     * Asserts that the detector reports {@code type} when the resource name
     * in the metadata is {@code name}.
     *
     * @param type the expected media type
     * @param name the resource name to store in the metadata; may be {@code null}
     */
    private void assertDetect(MediaType type, String name) {
        Metadata metadata = new Metadata();
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
        assertEquals(type, detectOrFail(metadata));
    }

    /**
     * Runs the detector on {@code metadata}, passing {@code null} as the first
     * argument and a fresh {@link ParseContext}.
     *
     * <p>An {@link IOException} fails the test; the exception is attached as the
     * cause so that its stack trace is kept in the test report.
     *
     * @param metadata the metadata handed to the detector
     * @return the media type reported by the detector
     */
    private MediaType detectOrFail(Metadata metadata) {
        try {
            return detector.detect(null, metadata, new ParseContext());
        } catch (IOException e) {
            // fail(String, Throwable) always throws; "return" only satisfies the compiler.
            return fail(UNEXPECTED_IO_EXCEPTION, e);
        }
    }
}