MailDateParser.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.mailcommons;
import static java.time.ZoneOffset.UTC;
import static java.time.temporal.ChronoField.AMPM_OF_DAY;
import static java.time.temporal.ChronoField.DAY_OF_MONTH;
import static java.time.temporal.ChronoField.DAY_OF_WEEK;
import static java.time.temporal.ChronoField.HOUR_OF_AMPM;
import static java.time.temporal.ChronoField.HOUR_OF_DAY;
import static java.time.temporal.ChronoField.INSTANT_SECONDS;
import static java.time.temporal.ChronoField.MILLI_OF_SECOND;
import static java.time.temporal.ChronoField.MINUTE_OF_HOUR;
import static java.time.temporal.ChronoField.MONTH_OF_YEAR;
import static java.time.temporal.ChronoField.OFFSET_SECONDS;
import static java.time.temporal.ChronoField.SECOND_OF_MINUTE;
import static java.time.temporal.ChronoField.YEAR;
import static org.apache.tika.utils.DateUtils.MIDDAY;
import java.text.ParseException;
import java.text.ParsePosition;
import java.time.DateTimeException;
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.time.format.DateTimeParseException;
import java.time.format.ResolverStyle;
import java.time.format.SignStyle;
import java.time.temporal.ChronoField;
import java.time.temporal.TemporalAccessor;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.utils.StringUtils;
/**
* Dates in emails are a mess. There are at least two major date-related bugs in JDK 8,
* and I've found differing behavior, bug or not, between JDK 8 and JDK 11/17.
* This class does its best to parse date strings. It does have a US-based date bias
* (for example, month/day ordering is tried before day/month ordering).
* Please open a ticket if this needs to be fixed. We can also add overrides via the parser
* config to manage customization of date formats.
*
* This code does not spark joy, especially given the differences in behavior between
* JDK versions.
*
* At some point, we should probably try joda or, heaven forfend, a pile of regexes.
*/
public class MailDateParser {
//TIKA-1970 Mac Mail's format is GMT+1 so we need to check for hour only
//Also, there are numerous bugs in jdk 8 with localized offsets
//so we need to get rid of the GMT/UTC component (e.g. https://bugs.openjdk.org/browse/JDK-8154520)
private static final Pattern LOCALIZED_OFFSET_PATTERN =
Pattern.compile("(?:UTC|GMT)\\s*([-+])\\s*(\\d?\\d):?(\\d\\d)?\\Z");
//this is used to strip junk after a fairly full offset:
// Wed, 26 Jan 2022 09:14:37 +0100 (CET)
// Also insert colon to avoid, ahem, behavior that is different in jdk 11 and jdk 17 than jdk8
// with "Mon, 9 May 2016 3:32:00 +0200"
//we add the first pattern -\\d\\d-\\d\\d\\d\\d so that we skip over 10-10-2000 via
//the while loop.
private static final Pattern OFFSET_PATTERN =
Pattern.compile("(?:(?:-\\d\\d-\\d{4})|([-+])\\s*(\\d?\\d):?(\\d\\d))");
private static final Pattern DAYS_OF_WEEK =
Pattern.compile("(?:\\A| )(MON|MONDAY|TUE|TUES|TUESDAY|WED|WEDNESDAY|THU|THUR|THURS" +
"|THURSDAY|FRI|FRIDAY|SAT|SATURDAY|SUN|SUNDAY) ");
//find a time ending in am/pm without a space: 10:30am and
//use this pattern to insert space: 10:30 am
private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b");
//Taken nearly directly from mime4j
private static Map<Long, String> monthOfYear() {
HashMap<Long, String> result = new HashMap<>();
result.put(1L, "JAN");
result.put(2L, "FEB");
result.put(3L, "MAR");
result.put(4L, "APR");
result.put(5L, "MAY");
result.put(6L, "JUN");
result.put(7L, "JUL");
result.put(8L, "AUG");
result.put(9L, "SEP");
result.put(10L, "OCT");
result.put(11L, "NOV");
result.put(12L, "DEC");
return result;
}
private static Map<Long, String> dayOfWeek() {
HashMap<Long, String> result = new HashMap<>();
result.put(1L, "MON");
result.put(2L, "TUE");
result.put(3L, "WED");
result.put(4L, "THU");
result.put(5L, "FRI");
result.put(6L, "SAT");
result.put(7L, "SUN");
return result;
}
private static final int INITIAL_YEAR = 1970;
private static final DateTimeFormatter TIME_ZONE_FORMATTER
= new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.parseLenient()
.optionalStart()
.appendLiteral(' ') //optional space before any of the time zone offset/ids
.optionalEnd()
.optionalStart()
.appendZoneId()
.optionalEnd()
.optionalStart()
.appendPattern("X")//localized zone offset, e.g. Z; -08; -0830; -08:30; -083015; -08:30:15
.optionalEnd()
.optionalStart()
.appendPattern("z")//zone name, e.g. PST
.optionalEnd().toFormatter(Locale.US);
public static final DateTimeFormatter RFC_5322 = new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.parseLenient()
.optionalStart()
.appendText(DAY_OF_WEEK, dayOfWeek())
.appendLiteral(", ")
.optionalEnd()
.appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NOT_NEGATIVE)
.appendLiteral(' ')
.appendText(MONTH_OF_YEAR, monthOfYear())
.appendLiteral(' ')
.appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
.appendLiteral(' ')
.appendValue(HOUR_OF_DAY, 2)
.appendLiteral(':')
.appendValue(MINUTE_OF_HOUR, 2)
.optionalStart()
.appendLiteral(':')
.appendValue(SECOND_OF_MINUTE, 2)
.optionalEnd()
.optionalStart()
.appendLiteral('.')
.appendValue(MILLI_OF_SECOND, 3)
.optionalEnd()
.optionalStart()
.appendLiteral(' ')
.appendOffset("+HHMM", "GMT")
.optionalEnd()
.toFormatter(Locale.US)
//.withZone(ZoneId.of("GMT")) see TIKA-3735
.withResolverStyle(ResolverStyle.LENIENT)
.withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_DAY, MINUTE_OF_HOUR,
SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
public static final DateTimeFormatter RFC_5322_LENIENT = new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.parseLenient()
.appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
.appendLiteral(' ')
.appendPattern("MMM")
.appendLiteral(' ')
.appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
.appendLiteral(' ')
.appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NEVER)
.appendLiteral(':')
.appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
.optionalStart()
.appendLiteral(':')
.appendValue(SECOND_OF_MINUTE, 2)
.optionalEnd()
.optionalStart()
.appendLiteral('.')
.appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
.optionalEnd()
.optionalStart()
.append(TIME_ZONE_FORMATTER)
.optionalEnd()
.toFormatter(Locale.US)
//.withZone(ZoneId.of("GMT")) see TIKA-3735
.withResolverStyle(ResolverStyle.LENIENT)
.withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR,
HOUR_OF_DAY, MINUTE_OF_HOUR,
SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
//this differs only from RFC_5322_LENIENT in requiring am/pm
public static final DateTimeFormatter RFC_5322_AMPM_LENIENT = new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.parseLenient()
.appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
.appendLiteral(' ')
.appendPattern("MMM")
.appendLiteral(' ')
.appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
.appendLiteral(' ')
.appendValue(ChronoField.HOUR_OF_AMPM, 1, 2, SignStyle.NEVER)
.appendLiteral(':')
.appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
.optionalStart()
.appendLiteral(':')
.appendValue(SECOND_OF_MINUTE, 2)
.optionalEnd()
.optionalStart()
.appendLiteral('.')
.appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
.optionalEnd()
.optionalStart()
.appendLiteral(' ') //optional space before am/pm
.optionalEnd()
.appendText(ChronoField.AMPM_OF_DAY)
.optionalStart()
.optionalStart()
.append(TIME_ZONE_FORMATTER)
.optionalEnd()
.toFormatter(Locale.US)
//.withZone(ZoneId.of("GMT")) see TIKA-3735
.withResolverStyle(ResolverStyle.LENIENT)
.withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_AMPM, AMPM_OF_DAY,
MINUTE_OF_HOUR,
SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
public static final DateTimeFormatter MMM_D_YYYY_HH_MM_AM_PM = // "July 9 2012 10:10:10 am UTC"
new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.parseLenient()
.appendPattern("MMM")
.appendLiteral(' ')
.appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
.appendLiteral(' ')
.appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
.appendLiteral(' ')
.appendValue(ChronoField.HOUR_OF_AMPM, 1, 2, SignStyle.NEVER)
.appendLiteral(':')
.appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
.optionalStart()
.appendLiteral(':')
.appendValue(SECOND_OF_MINUTE, 2)
.optionalEnd()
.optionalStart()
.appendLiteral('.')
.appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
.optionalEnd()
.optionalStart()
.appendLiteral(' ') //optional space before am/pm
.optionalEnd()
.appendText(ChronoField.AMPM_OF_DAY)
.optionalStart()
.append(TIME_ZONE_FORMATTER)
.optionalEnd()
.toFormatter(Locale.US)
//.withZone(ZoneId.of("GMT")) see TIKA-3735
.withResolverStyle(ResolverStyle.LENIENT)
.withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_AMPM, AMPM_OF_DAY,
MINUTE_OF_HOUR,
SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
public static final DateTimeFormatter MMM_D_YYYY_HH_MM = // "July 9 2012 10:10:10 UTC"
new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.parseLenient()
.appendPattern("MMM")
.appendLiteral(' ')
.appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
.appendLiteral(' ')
.appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
.appendLiteral(' ')
.appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NEVER)
.appendLiteral(':')
.appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
.optionalStart()
.appendLiteral(':')
.appendValue(SECOND_OF_MINUTE, 2)
.optionalEnd()
.optionalStart()
.appendLiteral('.')
.appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
.optionalEnd()
.optionalStart()
.optionalStart()
.append(TIME_ZONE_FORMATTER)
.optionalEnd()
.toFormatter(Locale.US)
//.withZone(ZoneId.of("GMT")) see TIKA-3735
.withResolverStyle(ResolverStyle.LENIENT)
.withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_DAY,
MINUTE_OF_HOUR,
SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
public static final DateTimeFormatter MM_SLASH_DD_SLASH_YY_HH_MM = //
// US-based month/day ordering !!!! e.g. 7/9/2012 10:10:10"
new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.parseLenient()
.appendValue(MONTH_OF_YEAR, 1, 2, SignStyle.NEVER)
.appendLiteral('/')
.appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
.appendLiteral('/')
.appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
.appendLiteral(' ')
.appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NEVER)
.optionalStart()
.appendLiteral(':')
.appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
.optionalEnd()
.optionalStart()
.appendLiteral(':')
.appendValue(SECOND_OF_MINUTE, 2)
.optionalEnd()
.optionalStart()
.appendLiteral('.')
.appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
.optionalEnd()
.optionalStart()
.append(TIME_ZONE_FORMATTER)
.optionalEnd()
.toFormatter(Locale.US)
//.withZone(ZoneId.of("GMT")) see TIKA-3735
.withResolverStyle(ResolverStyle.LENIENT)
.withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_DAY,
MINUTE_OF_HOUR,
SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
public static final DateTimeFormatter MM_SLASH_DD_SLASH_YY_HH_MM_AM_PM =
// US-based month/day ordering !!!! e.g. 7/9/2012 10:10:10 AM UTC"
new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.parseLenient()
.appendValue(MONTH_OF_YEAR, 1, 2, SignStyle.NEVER)
.appendLiteral('/')
.appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
.appendLiteral('/')
.appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
.appendLiteral(' ')
.appendValue(HOUR_OF_AMPM, 1, 2, SignStyle.NEVER)
.optionalStart()
.appendLiteral(':')
.appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
.optionalEnd()
.optionalStart()
.appendLiteral(':')
.appendValue(SECOND_OF_MINUTE, 2)
.optionalEnd()
.optionalStart()
.appendLiteral('.')
.appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
.optionalEnd()
.optionalStart()
.appendLiteral(' ')
.optionalEnd()
.appendText(AMPM_OF_DAY)
.optionalStart()
.append(TIME_ZONE_FORMATTER)
.optionalEnd()
.toFormatter(Locale.US)
//.withZone(ZoneId.of("GMT")) see TIKA-3735
.withResolverStyle(ResolverStyle.LENIENT)
.withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_AMPM,
AMPM_OF_DAY,
MINUTE_OF_HOUR,
SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
public static final DateTimeFormatter YYYY_MM_DD_HH_MM = // "2012-10-10 10:10:10 UTC"
new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.parseLenient()
.appendValue(YEAR, 4)
.appendLiteral('-')
.appendValue(MONTH_OF_YEAR, 2, 2, SignStyle.NEVER)
.appendLiteral('-')
.appendValue(DAY_OF_MONTH, 2, 2, SignStyle.NEVER)
.appendLiteral(' ')
.appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NEVER)
.appendLiteral(':')
.appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
.optionalStart()
.appendLiteral(':')
.appendValue(SECOND_OF_MINUTE, 2)
.optionalEnd()
.optionalStart()
.appendLiteral('.')
.appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
.optionalEnd()
.optionalStart()
.append(TIME_ZONE_FORMATTER)
.optionalEnd()
.toFormatter(Locale.US)
//.withZone(ZoneId.of("GMT")) see TIKA-3735
.withResolverStyle(ResolverStyle.LENIENT)
.withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_DAY,
MINUTE_OF_HOUR,
SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
public static final DateTimeFormatter YYYY_MM_DD = // "2012-10-10"
new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.parseLenient()
.appendValue(YEAR, 4)
.appendLiteral('-')
.appendValue(MONTH_OF_YEAR, 2, 2, SignStyle.NEVER)
.appendLiteral('-')
.appendValue(DAY_OF_MONTH, 2, 2, SignStyle.NEVER)
.toFormatter(Locale.US)
//.withZone(ZoneId.of("GMT")) see TIKA-3735
.withResolverStyle(ResolverStyle.LENIENT)
.withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR);
public static final DateTimeFormatter MM_SLASH_DD_SLASH_YYYY =
new DateTimeFormatterBuilder()
.appendPattern("M/d/")
.appendValueReduced(ChronoField.YEAR, 2, 4, INITIAL_YEAR)
.toFormatter(Locale.US).withZone(MIDDAY.toZoneId());
public static final DateTimeFormatter DD_SLASH_MM_SLASH_YYYY =
new DateTimeFormatterBuilder()
.appendPattern("d/M/")
.appendValueReduced(ChronoField.YEAR, 2, 4, INITIAL_YEAR)
.toFormatter(Locale.US).withZone(MIDDAY.toZoneId());
public static final DateTimeFormatter MMM_DD_YY =
new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.parseLenient()
.appendPattern("MMM")
.appendLiteral(' ')
.appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
.appendLiteral(' ')
.appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
.toFormatter(Locale.US);
public static final DateTimeFormatter DD_MMM_YY =
new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.parseLenient()
.appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
.appendLiteral(' ')
.appendPattern("MMM")
.appendLiteral(' ')
.appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
.toFormatter(Locale.US);
public static final DateTimeFormatter YY_SLASH_MM_SLASH_DD =
new DateTimeFormatterBuilder()
.appendValueReduced(ChronoField.YEAR, 2, 4, INITIAL_YEAR)
.appendPattern("/M/d")
.toFormatter(Locale.US).withZone(MIDDAY.toZoneId());
private static final DateTimeFormatter[] DATE_FORMATTERS = new DateTimeFormatter[] {
DD_MMM_YY,
MMM_DD_YY,
YYYY_MM_DD,
MM_SLASH_DD_SLASH_YYYY,//try American first?
DD_SLASH_MM_SLASH_YYYY,//if that fails, try rest of world?
YY_SLASH_MM_SLASH_DD
};
private static final DateTimeFormatter[] DATE_TIME_FORMATTERS = new DateTimeFormatter[] {
RFC_5322_LENIENT,
RFC_5322_AMPM_LENIENT,
MMM_D_YYYY_HH_MM,
MMM_D_YYYY_HH_MM_AM_PM,
YYYY_MM_DD_HH_MM,
MM_SLASH_DD_SLASH_YY_HH_MM,
MM_SLASH_DD_SLASH_YY_HH_MM_AM_PM
};
public static Date parseRFC5322(String string) throws ParseException {
//this fails on: MON, 9 MAY 2016 3:32:00 GMT+0200 ... it stops short and doesn't include
// the +0200?!
if (string != null) {
string = string.trim();
string = string.toUpperCase(Locale.US);
}
return Date.from(Instant.from(RFC_5322.parse(string, new ParsePosition(0))));
}
public static Date parseDateLenient(String text) {
if (text == null) {
return null;
}
String normalized = normalize(text);
for (DateTimeFormatter dateTimeFormatter : DATE_TIME_FORMATTERS) {
try {
ZonedDateTime zonedDateTime = ZonedDateTime.parse(normalized, dateTimeFormatter);
return Date.from(Instant.from(zonedDateTime));
} catch (SecurityException e) {
throw e;
} catch (DateTimeParseException e) {
//There's a bug in java 8 that if we include .withZone in the DateTimeFormatter,
//that will override the offset/timezone id even if it included
// in the original string. This is fixed in later versions of Java.
// Once we move to Java 11, we can get rid of this. Can't make this up...
try {
LocalDateTime localDateTime = LocalDateTime.parse(normalized, dateTimeFormatter);
return Date.from(Instant.from(localDateTime.atOffset(UTC)));
} catch (SecurityException e2) {
throw e2;
} catch (Exception e2) {
//swallow
}
} catch (Exception e) {
//can get StringIndexOutOfBoundsException because of a bug in java 8
//ignore
}
}
for (DateTimeFormatter dateFormatter : DATE_FORMATTERS) {
try {
TemporalAccessor temporalAccessor = dateFormatter.parse(normalized);
ZonedDateTime localDate = LocalDate.from(temporalAccessor)
.atStartOfDay()
.atZone(MIDDAY.toZoneId());
return Date.from(Instant.from(localDate));
} catch (SecurityException e) {
throw e;
} catch (Exception e) {
//ignore
}
}
return null;
}
private static boolean hasInstantSeconds(TemporalAccessor temporalAccessor) {
try {
temporalAccessor.getLong(INSTANT_SECONDS);
return true;
} catch (DateTimeException e) {
return false;
}
}
protected static String normalize(String text) {
text = text.toUpperCase(Locale.US);
//strip out commas
text = text.replaceAll(",", "");
//1) strip off extra stuff after +0800, e.g. "Mon, 9 May 2016 7:32:00 UTC+0600 (BST)",
//2) insert a colon btwn hrs and minutes to avoid a difference in behavior
// between jdk 8 and jdk 11+17
Matcher matcher = OFFSET_PATTERN.matcher(text);
while (matcher.find()) {
if (matcher.group(1) != null) {
text = text.substring(0, matcher.start());
text += matcher.group(1) + StringUtils.leftPad(matcher.group(2), 2, '0') + ":" +
matcher.group(3);
break;
}
}
matcher = LOCALIZED_OFFSET_PATTERN.matcher(text);
if (matcher.find()) {
text = buildLocalizedOffset(matcher, text);
}
matcher = AM_PM.matcher(text);
if (matcher.find()) {
text = matcher.replaceFirst("$1 $2");
}
//The rfc_lenient parser had a problem parsing dates
//with days of week missing and a timezone: 9 May 2016 01:32:00 UTC
//The day of week is not used in the resolvers, so we may as well throw
//out that info
matcher = DAYS_OF_WEEK.matcher(text);
if (matcher.find()) {
text = matcher.replaceAll(" ");
}
//16 May 2016 at 09:30:32 GMT+1
text = text.replaceAll("(?i) at ", " ");
//just cause
text = text.replaceAll("\\s+", " ").trim();
return text;
}
private static String buildLocalizedOffset(Matcher matcher, String text) {
StringBuilder sb = new StringBuilder();
sb.append(text.substring(0, matcher.start()));
sb.append(matcher.group(1));// +/-
sb.append(StringUtils.leftPad(matcher.group(2), 2, '0'));//HH
sb.append(":");
if (matcher.group(3) != null) {
sb.append(matcher.group(3));
} else {
sb.append("00");
}
sb.append(text.substring(matcher.end()));
return sb.toString();
}
}