// TesseractOCRConfigTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.ocr;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;

import java.util.Arrays;
import java.util.List;

import org.junit.jupiter.api.Test;

import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.parser.CompositeParser;

/**
 * Unit tests for {@link TesseractOCRConfig}.
 *
 * <p>Covers three areas: the values reported by a freshly constructed config, the values
 * reported after loading a JSON configuration through {@link TikaLoader}, and the
 * argument validation performed by the individual setters.
 */
public class TesseractOCRConfigTest extends TikaTest {

    /**
     * Loads the named test configuration and returns the default config of the first
     * component parser.
     *
     * <p>The first component parser is cast to {@link TesseractOCRParser}; a configuration
     * that lists a different parser first makes the calling test fail with a
     * {@link ClassCastException}.
     *
     * @param configName name of the configuration resource, resolved relative to this class
     * @return the default {@link TesseractOCRConfig} of the loaded parser
     * @throws Exception if the configuration cannot be located or loaded
     */
    private TesseractOCRConfig loadDefaultConfig(String configName) throws Exception {
        TikaLoader loader = TikaLoader.load(
                getConfigPath(TesseractOCRConfigTest.class, configName));
        CompositeParser composite = (CompositeParser) loader.loadParsers();
        TesseractOCRParser parser =
                (TesseractOCRParser) composite.getAllComponentParsers().get(0);
        return parser.getDefaultConfig();
    }

    /**
     * A config created with the no-arg constructor reports the expected defaults.
     */
    @Test
    public void testNoConfig() throws Exception {
        TesseractOCRConfig config = new TesseractOCRConfig();
        assertEquals("eng", config.getLanguage(), "Invalid default language value");
        assertEquals("1", config.getPageSegMode(), "Invalid default pageSegMode value");
        assertEquals(0, config.getMinFileSizeToOcr(), "Invalid default minFileSizeToOcr value");
        assertEquals(Integer.MAX_VALUE, config.getMaxFileSizeToOcr(),
                "Invalid default maxFileSizeToOcr value");
        assertEquals(120, config.getTimeoutSeconds(), "Invalid default timeout value");
        assertEquals(300, config.getDensity(), "Invalid default density value");
        assertEquals(4, config.getDepth(), "Invalid default depth value");
        assertEquals("gray", config.getColorspace(), "Invalid default colorspace value");
        assertEquals("triangle", config.getFilter(), "Invalid default filter value");
        assertEquals(200, config.getResize(), "Invalid default resize value");
        assertEquals(false, config.isApplyRotation(), "Invalid default applyRotation value");
    }

    /**
     * Loading {@code tika-config-tesseract-partial.json} yields the overridden values for
     * the settings asserted as "overridden" below, while pageSegMode, maxFileSizeToOcr and
     * applyRotation still report the same values as in {@link #testNoConfig()}.
     */
    @Test
    public void testPartialConfig() throws Exception {
        TesseractOCRConfig config = loadDefaultConfig("tika-config-tesseract-partial.json");
        assertEquals("fra+deu", config.getLanguage(), "Invalid overridden language value");
        assertEquals("1", config.getPageSegMode(), "Invalid default pageSegMode value");
        assertEquals(1, config.getMinFileSizeToOcr(), "Invalid overridden minFileSizeToOcr value");
        assertEquals(Integer.MAX_VALUE, config.getMaxFileSizeToOcr(),
                "Invalid default maxFileSizeToOcr value");
        assertEquals(240, config.getTimeoutSeconds(), "Invalid overridden timeout value");
        assertEquals(200, config.getDensity(), "Invalid overridden density value");
        assertEquals(8, config.getDepth(), "Invalid overridden depth value");
        assertEquals("box", config.getFilter(), "Invalid overridden filter value");
        assertEquals(300, config.getResize(), "Invalid overridden resize value");
        assertEquals(false, config.isApplyRotation(), "Invalid default applyRotation value");
    }

    /**
     * Loading {@code tika-config-tesseract-full.json} yields a non-default value for every
     * setting checked here.
     */
    @Test
    public void testFullConfig() throws Exception {
        TesseractOCRConfig config = loadDefaultConfig("tika-config-tesseract-full.json");
        assertEquals("ceb", config.getLanguage(), "Invalid overridden language value");
        assertEquals("2", config.getPageSegMode(), "Invalid overridden pageSegMode value");
        assertEquals(1, config.getMinFileSizeToOcr(), "Invalid overridden minFileSizeToOcr value");
        assertEquals(2000000, config.getMaxFileSizeToOcr(),
                "Invalid overridden maxFileSizeToOcr value");
        assertEquals(240, config.getTimeoutSeconds(), "Invalid overridden timeout value");
        assertEquals(200, config.getDensity(), "Invalid overridden density value");
        assertEquals(8, config.getDepth(), "Invalid overridden depth value");
        assertEquals("box", config.getFilter(), "Invalid overridden filter value");
        assertEquals(300, config.getResize(), "Invalid overridden resize value");
        assertEquals(true, config.isApplyRotation(), "Invalid overridden applyRotation value");
    }

    /**
     * Every well-formed language specification is accepted and returned unchanged by the
     * getter.
     */
    @Test
    public void testValidateValidLanguage() {
        List<String> validLanguages =
                Arrays.asList("eng", "slk_frak", "chi_tra", "eng+fra", "tgk+chi_tra+slk_frak",
                        "chi_tra_vert", "tgk+chi_tra_vert+slk_frak", "eng+script/Arabic",
                        "script/HanT_vert");

        TesseractOCRConfig config = new TesseractOCRConfig();

        for (String language : validLanguages) {
            config.setLanguage(language);
            assertEquals(language, config.getLanguage(), "Valid language not set");
        }
    }

    /**
     * Every malformed language specification is rejected with an
     * {@link IllegalArgumentException}.
     */
    @Test
    public void testValidateInvalidLanguage() {
        List<String> invalidLanguages = Arrays.asList(
                //"", allow empty string
                "+", "en", "en+", "eng+fra+", "Arabic", "/script/Arabic", "rm -rf *");

        TesseractOCRConfig config = new TesseractOCRConfig();

        for (String language : invalidLanguages) {
            try {
                config.setLanguage(language);
                // Reaching this line means the setter accepted the bad value; the message
                // names the offending entry so a failure is easy to trace.
                fail("Invalid language set: " + language);
            } catch (IllegalArgumentException expected) {
                // expected exception thrown
            }
        }
    }

    /**
     * Page segmentation modes "0" and "10" are accepted and stored; "14" is rejected.
     */
    @Test
    public void testValidatePageSegMode() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.setPageSegMode("0");
        assertEquals("0", config.getPageSegMode(), "Valid pageSegMode not set");
        config.setPageSegMode("10");
        assertEquals("10", config.getPageSegMode(), "Valid pageSegMode not set");
        assertThrows(IllegalArgumentException.class, () -> {
            config.setPageSegMode("14");
        });
    }

    /**
     * Densities 300 and 400 are accepted and stored; 1 is rejected.
     */
    @Test
    public void testValidateDensity() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.setDensity(300);
        assertEquals(300, config.getDensity(), "Valid density not set");
        config.setDensity(400);
        assertEquals(400, config.getDensity(), "Valid density not set");
        assertThrows(IllegalArgumentException.class, () -> {
            config.setDensity(1);
        });
    }

    /**
     * Depths 4 and 8 are accepted and stored; 6 is rejected.
     */
    @Test
    public void testValidateDepth() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.setDepth(4);
        assertEquals(4, config.getDepth(), "Valid depth not set");
        config.setDepth(8);
        assertEquals(8, config.getDepth(), "Valid depth not set");
        assertThrows(IllegalArgumentException.class, () -> {
            config.setDepth(6);
        });
    }

    /**
     * Filters "Triangle" and "box" are accepted and stored; "abc" is rejected.
     */
    @Test
    public void testValidateFilter() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.setFilter("Triangle");
        // Compared case-insensitively: the default is reported as "triangle", and this test
        // should not depend on whether the setter normalises the case of its argument.
        assertTrue("Triangle".equalsIgnoreCase(config.getFilter()), "Valid filter not set");
        config.setFilter("box");
        assertEquals("box", config.getFilter(), "Valid filter not set");
        assertThrows(IllegalArgumentException.class, () -> {
            config.setFilter("abc");
        });
    }

    /**
     * Resize values 200 and 400 are accepted and stored; 1000 is rejected.
     */
    @Test
    public void testValidateResize() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.setResize(200);
        assertEquals(200, config.getResize(), "Valid resize not set");
        config.setResize(400);
        assertEquals(400, config.getResize(), "Valid resize not set");
        assertThrows(IllegalArgumentException.class, () -> {
            config.setResize(1000);
        });
    }

    /**
     * A tessdata path containing a NUL character is rejected.
     */
    @Test
    public void testDataPathCheck() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        assertThrows(IllegalArgumentException.class, () -> {
            config.setTessdataPath("blah\u0000deblah");
        });
    }

    /**
     * A tesseract path containing a NUL character is rejected.
     */
    @Test
    public void testPathCheck() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        assertThrows(IllegalArgumentException.class, () -> {
            config.setTesseractPath("blah\u0000deblah");
        });
    }

    /**
     * An extra tesseract config key containing a space is rejected.
     */
    @Test
    public void testBadOtherKey() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        assertThrows(IllegalArgumentException.class, () -> {
            config.addOtherTesseractConfig("bad bad", "bad");
        });
    }

    /**
     * An extra tesseract config value containing a space is rejected.
     */
    @Test
    public void testBadOtherValue() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        assertThrows(IllegalArgumentException.class, () -> {
            config.addOtherTesseractConfig("bad", "bad bad");
        });
    }

    /**
     * An extra tesseract config value containing a backslash is rejected.
     */
    @Test
    public void testBadOtherValueSlash() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        assertThrows(IllegalArgumentException.class, () -> {
            config.addOtherTesseractConfig("bad", "bad\\bad");
        });
    }

    /**
     * An extra tesseract config value containing a control character is rejected.
     */
    @Test
    public void testBadOtherValueControl() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        assertThrows(IllegalArgumentException.class, () -> {
            config.addOtherTesseractConfig("bad", "bad\u0001bad");
        });
    }

    /**
     * A plain alphabetic key/value pair is accepted; the test passes as long as no
     * exception is thrown.
     */
    @Test
    public void testGoodOtherParameters() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.addOtherTesseractConfig("good", "good");
    }

    /**
     * The language code "kerplekistani" is rejected.
     */
    @Test
    public void testBadLanguageCode() throws Exception {
        TesseractOCRConfig tesseractOCRConfig = new TesseractOCRConfig();
        assertThrows(IllegalArgumentException.class, () -> {
            tesseractOCRConfig.setLanguage("kerplekistani");
        });
    }

    /**
     * A colorspace name containing a non-alphanumeric character is rejected.
     */
    @Test
    public void testBadColorSpace() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        assertThrows(IllegalArgumentException.class, () -> {
            config.setColorspace("someth!ng");
        });
    }
}