// JpegParserTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.image;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.util.Arrays;
import java.util.List;
import java.util.TimeZone;

import org.junit.jupiter.api.Test;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Geographic;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TIFF;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPMM;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;

/**
 * Tests for {@link JpegParser}. Each test parses one JPEG fixture from
 * {@code /test-documents} and checks the metadata the parser populated.
 */
public class JpegParserTest extends TikaTest {

    // Default time zone captured when this class is initialized. It is not referenced
    // anywhere in this class.
    // NOTE(review): possibly a leftover from time-zone dependent tests - confirm before removing.
    static TimeZone CURR_TIME_ZONE = TimeZone.getDefault();
    private final Parser parser = new JpegParser();

    /**
     * Parses the given classpath resource with the JPEG parser and returns the
     * metadata it populated. The content type is pre-set to {@code image/jpeg}
     * and the SAX events are discarded, as these tests only inspect metadata.
     *
     * @param resourcePath classpath location of the JPEG fixture
     * @return the metadata object filled in by the parser
     * @throws Exception if the resource cannot be read or parsed
     */
    private Metadata parseJpegResource(String resourcePath) throws Exception {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
        try (TikaInputStream tis = getResourceAsStream(resourcePath)) {
            parser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
        }
        return metadata;
    }

    /**
     * Asserts the core EXIF/TIFF tags that testJPEG_EXIF.jpg and testJPEG_GEO.jpg
     * are both expected to carry.
     *
     * @param metadata metadata produced by parsing one of those two fixtures
     */
    private void assertCanonExifTags(Metadata metadata) {
        assertEquals("3888", metadata.get(Metadata.IMAGE_WIDTH));
        assertEquals("2592", metadata.get(Metadata.IMAGE_LENGTH));
        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
        assertNull(metadata.get(Metadata.SAMPLES_PER_PIXEL));

        assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
        assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
        assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
        assertEquals("194.0", metadata.get(Metadata.FOCAL_LENGTH));
        assertEquals("400", metadata.get(Metadata.ISO_SPEED_RATINGS));
        assertEquals("Canon", metadata.get(Metadata.EQUIPMENT_MAKE));
        assertEquals("Canon EOS 40D", metadata.get(Metadata.EQUIPMENT_MODEL));
        assertEquals("Adobe Photoshop CS3 Macintosh", metadata.get(Metadata.SOFTWARE));
        assertNull(metadata.get(Metadata.ORIENTATION)); // Not present
        assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
        assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
    }

    /**
     * Test for a file with EXIF data: core tags, raw tag values, dates and keywords.
     */
    @Test
    public void testJPEG() throws Exception {
        Metadata metadata = parseJpegResource("/test-documents/testJPEG_EXIF.jpg");

        // Core EXIF/TIFF tags
        assertCanonExifTags(metadata);

        // Check that EXIF/TIFF tags come through with their raw values too
        // (This may be removed for Tika 1.0, as we support more of them
        //  with explicit Metadata entries)
        assertEquals("Canon EOS 40D",
                metadata.get(ImageMetadataExtractor.UNKNOWN_IMG_NS + "Exif IFD0:Model"));

        // Common tags
        assertEquals("2009-10-02T23:02:49", metadata.get(TikaCoreProperties.MODIFIED));
        assertEquals("2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED),
                "Date/Time Original for when the photo was taken, unspecified time zone");

        // Keywords must come through as separate values, not as one joined string
        List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT));
        assertTrue(keywords.contains("canon-55-250"), "'canon-55-250' expected in " + keywords);
        assertTrue(keywords.contains("moscow-birds"), "'moscow-birds' expected in " + keywords);
        assertTrue(keywords.contains("serbor"), "'serbor' expected in " + keywords);
        assertFalse(keywords.contains("canon-55-250 moscow-birds serbor"));
    }

    /**
     * Test for a file with Geographic information (lat, long etc) in it
     */
    @Test
    public void testJPEGGeo() throws Exception {
        Metadata metadata = parseJpegResource("/test-documents/testJPEG_GEO.jpg");

        // Geo tags
        assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
        assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));

        // Core EXIF/TIFF tags
        assertCanonExifTags(metadata);

        // Common tags
        assertEquals("2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED),
                "Date/Time Original for when the photo was taken, unspecified time zone");
        assertEquals("2009-10-02T23:02:49", metadata.get(TikaCoreProperties.MODIFIED),
                "This image has different Date/Time than Date/Time Original, " +
                        "so it is probably modification date");
        assertEquals("2009-08-11T09:09:45", metadata.get(TIFF.ORIGINAL_DATE),
                "Date/Time Original should be stored in EXIF field too");
        assertEquals("canon-55-250", metadata.getValues(TikaCoreProperties.SUBJECT)[0]);
    }

    /**
     * Test for an image with the geographic information stored in a slightly
     * different way, see TIKA-915 for details
     */
    @Test
    public void testJPEGGeo2() throws Exception {
        Metadata metadata = parseJpegResource("/test-documents/testJPEG_GEO_2.jpg");

        // Geo tags should come through at full precision (six decimal places
        // for this file), and not rounded
        assertEquals("51.575762", metadata.get(Metadata.LATITUDE));
        assertEquals("-1.567886", metadata.get(Metadata.LONGITUDE));

        assertEquals("2012-02-20T16:44:22Z", metadata.get(Geographic.TIMESTAMP));
    }

    /**
     * Test for a file carrying a title, description, creator and keywords,
     * including non-ascii characters, plus EXIF and ICC data.
     */
    @Test
    public void testJPEGTitleAndDescription() throws Exception {
        Metadata metadata = parseJpegResource("/test-documents/testJPEG_commented.jpg");

        // embedded comments with non-ascii characters
        assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)",
                metadata.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));

        // Dublin Core
        // xmp handles spaces in keywords, returns "bird watching,
        // nature reserve, coast, grazelands"; the keywords are expected
        // to come through with their spaces intact
        List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT));
        assertTrue(keywords.contains("coast"));
        assertTrue(keywords.contains("bird watching"));

        // Core EXIF/TIFF tags
        assertEquals("103", metadata.get(Metadata.IMAGE_WIDTH));
        assertEquals("77", metadata.get(Metadata.IMAGE_LENGTH));
        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
        assertNull(metadata.get(Metadata.SAMPLES_PER_PIXEL));

        assertEquals("1.0E-6", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1000000
        assertEquals("2.8", metadata.get(Metadata.F_NUMBER));
        assertEquals("4.6", metadata.get(Metadata.FOCAL_LENGTH));
        assertEquals("114", metadata.get(Metadata.ISO_SPEED_RATINGS));
        assertNull(metadata.get(Metadata.EQUIPMENT_MAKE));
        assertNull(metadata.get(Metadata.EQUIPMENT_MODEL));
        assertNull(metadata.get(Metadata.SOFTWARE));
        assertEquals("1", metadata.get(Metadata.ORIENTATION)); // Present in this file
        assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
        assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));

        // ICC: the value must be stored under the "ICC:" prefixed key only
        assertEquals("IEC", metadata.get("ICC:Device manufacturer").trim());
        assertNull(metadata.get("Device manufacturer"));
    }

    /**
     * Same title/description/creator/keyword checks against the
     * testJPEG_commented_pspcs2mac.jpg fixture.
     */
    @Test
    public void testJPEGTitleAndDescriptionPhotoshop() throws Exception {
        Metadata metadata = parseJpegResource("/test-documents/testJPEG_commented_pspcs2mac.jpg");

        // embedded comments with non-ascii characters
        assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)",
                metadata.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
        List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT));
        assertTrue(keywords.contains("bird watching"), "got " + keywords);
    }

    /**
     * Description and keyword checks against the
     * testJPEG_commented_xnviewmp026.jpg fixture.
     */
    @Test
    public void testJPEGTitleAndDescriptionXnviewmp() throws Exception {
        Metadata metadata = parseJpegResource("/test-documents/testJPEG_commented_xnviewmp026.jpg");

        // XnViewMp's default comment dialog has only comment, not headline.
        // Comment is embedded only if "Write comments in XMP" is enabled in settings
        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)",
                metadata.get(TikaCoreProperties.DESCRIPTION));

        // xmp handles spaces in keywords, returns "bird watching, nature reserve,
        // coast, grazelands"; the keywords are expected to come through with
        // their spaces intact
        List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT));
        assertTrue(keywords.contains("coast"), "'coast' not in " + keywords);
        assertTrue(keywords.contains("nature reserve"), "'nature reserve' not in " + keywords);
    }

    /**
     * Test for the testJPEG_oddTagComponent.jpg fixture: no title or description
     * is expected, but the image dimensions must still be reported.
     */
    @Test
    public void testJPEGoddTagComponent() throws Exception {
        Metadata metadata = parseJpegResource("/test-documents/testJPEG_oddTagComponent.jpg");

        assertNull(metadata.get(TikaCoreProperties.TITLE));
        assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("251", metadata.get(Metadata.IMAGE_WIDTH));
        assertEquals("384", metadata.get(Metadata.IMAGE_LENGTH));
    }

    /**
     * Test for the testJPEG_EXIF_emptyDateTime.jpg fixture: parsing must complete
     * and the resolution tags must be reported.
     */
    @Test
    public void testJPEGEmptyEXIFDateTime() throws Exception {
        Metadata metadata = parseJpegResource("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");

        assertEquals("300.0", metadata.get(TIFF.RESOLUTION_HORIZONTAL));
        assertEquals("300.0", metadata.get(TIFF.RESOLUTION_VERTICAL));
    }

    /**
     * Test that the XMP Media Management document id is extracted.
     */
    @Test
    public void testJPEGXMPMM() throws Exception {
        Metadata metadata = parseJpegResource("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");

        // TODO: when jempbox is fixed/xmpbox is used
        // The bug in Jempbox is that it searches for children, not attributes.
        // In this file it's attributes.
        // add tests for history...currently not extracted
        assertEquals("xmp.did:49E997348D4911E1AB62EBF9B374B234", metadata.get(XMPMM.DOCUMENTID));
    }

}