MailDateParserTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.mailcommons;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.text.DateFormat;
import java.text.DateFormatSymbols;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;

import org.apache.james.mime4j.field.DateTimeFieldLenientImpl;
import org.junit.jupiter.api.Test;

public class MailDateParserTest {

    @Test
    public void testDateTimesWithTimeZones() throws Exception {
        String expected = "2016-05-09T01:32:00Z";

        //try with timezones
        for (String dateString : new String[] {
                // with timezone info:
                "Mon, 9 May 16 01:32:00 GMT",
                "9 May 16 01:32:00 GMT",
                "Monday, 9 May 16 01:32:00 GMT",
                "Mon, 9 May 2016 01:32:00 UTC",
                "9 May 2016 01:32:00 UTC",
                "09 May 2016 01:32:00 UTC",
                "Mon, 9 May 2016 01:32:00Z",
                "Mon, 9 May 2016 01:32:00 Z",
                "Mon, 9 May 2016 01:32:00 GMT",
                "Mon, 9 May 2016 01:32:00GMT",
                "Mon, 9 May 2016 01:32:00 UTC",
                "Mon, 9 May 2016 01:32:00UTC",

                "Mon, 9 May 2016 3:32:00 GMT+0200",
                "Mon, 9 May 2016 3:32:00 UTC+0200",
                "Mon, 9 May 2016 7:32:00 UTC+0600 (BST)",

                //try with leading space
                //"      Mon, 9 May 2016 3:32:00 +0200",
                "       9 May 2016 3:32:00 +0200",
                "Mon, 9 May 2016 3:32:00 +02:00",
                "9 May 2016 3:32:00 +02:00",
                "Mon, 9 May 2016 3:32:00+02:00",
                "Mon, 9 May 2016 3:32:00+0200",
                "      Sun, 8 May 2016 21:32:00 EST",
                //need to add am/pm format times?  I hope not.

        }) {
            testDate(dateString, expected, true);
        }
    }

    @Test// for dev purposes
    public void oneOff() throws Exception {
  /*      SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss z");
        System.out.println(simpleDateFormat.format(new Date()));
        DateTimeFormatter formatter = DateTimeFormatter
                .ofPattern("yyyy-MM-dd'T'HH:mm:ss.S OOOO")
                .withLocale(Locale.US);
        String date = formatter.format(ZonedDateTime.now(ZoneOffset.UTC));
        System.out.println("String: " + date);
        System.out.println("parsed: " + formatter.parse(date) + " from " + date);
*/
        String s = "Mon, 6 Sep 2010 05:25:34 -0400 (EDT)";
        s = "Tue, 9 Jun 2009 23:58:45 -0400";

        //System.out.println(RFC)
        try {
            System.out.println("mime4j: " + DateTimeFieldLenientImpl.RFC_5322.parse(s));
        } catch (Exception e) {
            System.out.println("mime4j: null");
        }
        try {
            Date d = MailDateParser.parseDateLenient(s);
            DateFormat df =
                    new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", new DateFormatSymbols(Locale.US));
            df.setTimeZone(TimeZone.getTimeZone("UTC"));
            String dateString = df.format(d);
            System.out.println("dev parser lenient: " + dateString);
        } catch (Exception e) {
            System.out.println("dev parser lenient: null");
        }
    }

    @Test
    public void testDateTimesWithNoTimeZone() throws Exception {
        String expected = "2016-05-09T01:32:00Z";

        for (String dateString : new String[]{
                "Mon, 9 May 2016 01:32:00",
                "Monday, 9 May 2016 1:32 AM", "May 9 2016 1:32am", "May 9 2016 1:32 am",
                "2016-05-09 01:32:00"}) {
            testDate(dateString, expected, true);
        }
    }

    @Test
    public void testDates() throws Exception {
        //now try days without times
        String expected = "2016-05-15T12:00:00Z";
        for (String dateString : new String[]{
                "May 15, 2016", "Sun, 15 May 2016", "15 May 2016",
                "2016-05-15"
        }) {
            testDate(dateString, expected, true);

        }
    }

    @Test
    public void testTrickyDates() throws Exception {
        DateFormat df = new SimpleDateFormat("yyyy-MM-dd", new DateFormatSymbols(Locale.US));
        //make sure there are no mis-parses of e.g. 90 = year 90 A.D, not 1990
        Date date1980 = df.parse("1980-01-01");
        Date date2010 = df.parse("2010-01-01");
        for (String dateString : new String[]{
                "11/14/08",
                "1/14/08",
                "1/2/08",
                "12/1/2008",
                "12/02/1996",
                "96/1/02",
                "96/12/02",
                "96/12/2",
                "1996/12/02",
                "Mon, 29 Jan 96 14:02 GMT",
                "7/20/95 1:12PM",
                "08/14/2000  12:48 AM",
                "8/4/2000  1:48 AM",
                "06/24/2008, Tuesday, 11 AM",
                }) {
            Date parsedDate = MailDateParser.parseDateLenient(dateString);
            assertNotNull(parsedDate);
            if (parsedDate != null) {
                assertTrue(parsedDate.getTime() > date1980.getTime(),
                        "date must be after 1980:" + dateString + " >> + " +
                                parsedDate);
                assertTrue(parsedDate.getTime() < date2010.getTime(),
                        "date must be before 2020: " + dateString + " >> + " +
                                parsedDate);
            }
        }
        //TODO: mime4j misparses these to pre 1980 dates
        //"Wed, 27 Dec 95 11:20:40 EST",
        //"26 Aug 00 11:14:52 EDT"
        //
        //We are still misparsing: 8/1/03 to a pre 1980 date

    }

    @Test
    public void testNormalization() throws Exception {
        String s = "10-10-2022";
        //make sure that the year does not have ":" inserted
        assertEquals(s, MailDateParser.normalize(s));
    }

    private void testDate(String dateString, String expected, boolean useUTC) throws Exception {
        Date parsedDate = MailDateParser.parseDateLenient(dateString);
        assertNotNull(parsedDate, "couldn't parse " + dateString);
        DateFormat df =
                new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", new DateFormatSymbols(Locale.US));
        if (useUTC) {
            df.setTimeZone(TimeZone.getTimeZone("UTC"));
        }
        String parsedDateString = df.format(parsedDate);
        assertEquals(expected, parsedDateString, "failed to match: " + dateString);
    }

}