RFC822ParserTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.mail;
import static java.nio.charset.StandardCharsets.US_ASCII;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.james.mime4j.stream.MimeConfig;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
/**
 * Tests for parsing RFC 822 (email) messages.
 * <p>
 * Most tests run against {@link #EXTRACT_ALL_ALTERNATIVES_PARSER}, which is loaded from
 * {@code tika-config-extract-all-alternatives.json}. Tests that call
 * {@code getRecursiveMetadata(String)} or {@code getXML(String)} without an explicit parser
 * rely on whatever parser the {@link TikaTest} base class supplies.
 */
public class RFC822ParserTest extends TikaTest {

    //legacy RFC822 behavior...extract every alternative part
    private static Parser EXTRACT_ALL_ALTERNATIVES_PARSER;

    /**
     * Opens a classpath resource as a {@link TikaInputStream}.
     * Fails the current test if the resource cannot be found.
     *
     * @param name resource path relative to the context class loader
     * @return an open stream; the caller is responsible for closing it
     */
    private static TikaInputStream getStream(String name) {
        InputStream stream =
                Thread.currentThread().getContextClassLoader().getResourceAsStream(name);
        assertNotNull(stream, "Test file not found " + name);
        return TikaInputStream.get(stream);
    }

    /**
     * Loads the auto-detect parser from the "extract all alternatives" config once
     * for the whole class.
     */
    @BeforeAll
    public static void setUp() throws Exception {
        EXTRACT_ALL_ALTERNATIVES_PARSER = TikaLoader.load(
                        getConfigPath(RFC822ParserTest.class,
                                "tika-config-extract-all-alternatives.json"))
                .loadAutoDetectParser();
    }

    /**
     * A plain single-part message: exactly one {@code p} element, no {@code div}
     * elements, and creator/title/subject taken from the headers.
     */
    @Test
    public void testSimple() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = mock(DefaultHandler.class);
        try (TikaInputStream tis = getStream("test-documents/testRFC822")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(tis, handler, metadata, new ParseContext());
        }

        verify(handler).startDocument();
        //just one body
        verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"),
                any(Attributes.class));
        verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
        //no multi-part body parts
        verify(handler, never())
                .startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"),
                        any(Attributes.class));
        verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
        verify(handler).endDocument();

        //note no leading spaces, and no quotes
        assertEquals("Julien Nioche (JIRA) <jira@apache.org>",
                metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
                metadata.get(TikaCoreProperties.TITLE));
        assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
                metadata.get(TikaCoreProperties.SUBJECT));
    }

    /**
     * The From header is split into separate name and email metadata values.
     */
    @Test
    public void testExtendedToFromMetadata() throws Exception {
        Metadata m = getXML("testRFC822").metadata;
        assertEquals("Julien Nioche (JIRA)", m.get(Message.MESSAGE_FROM_NAME));
        assertEquals("jira@apache.org", m.get(Message.MESSAGE_FROM_EMAIL));

        m = getXML("testRFC822-multipart").metadata;
        assertEquals("DigitalPebble", m.get(Message.MESSAGE_FROM_NAME));
        assertEquals("julien@digitalpebble.com", m.get(Message.MESSAGE_FROM_EMAIL));

        m = getXML("testRFC822_quoted").metadata;
        assertEquals("Another Person", m.get(Message.MESSAGE_FROM_NAME));
        assertEquals("another.person@another-example.com", m.get(Message.MESSAGE_FROM_EMAIL));

        m = getXML("testRFC822_i18nheaders").metadata;
        // Unicode escape keeps the expected value independent of the source file encoding
        assertEquals("Keld J\u00F8rn Simonsen", m.get(Message.MESSAGE_FROM_NAME));
        assertEquals("keld@dkuug.dk", m.get(Message.MESSAGE_FROM_EMAIL));

        //this is currently detected as mbox!!!
        m = getXML("testEmailWithPNGAtt.eml", new RFC822Parser()).metadata;
        assertEquals("Tika Test", m.get(Message.MESSAGE_FROM_NAME));
        assertEquals("XXXX@apache.org", m.get(Message.MESSAGE_FROM_EMAIL));
    }

    /**
     * A multipart message: first checks the SAX events emitted, then re-parses and
     * checks the extracted body text.
     */
    @Test
    public void testMultipart() throws Exception {
        ParseContext context = new ParseContext();
        context.set(Parser.class, EXTRACT_ALL_ALTERNATIVES_PARSER);

        ContentHandler mockHandler = mock(XHTMLContentHandler.class);
        try (TikaInputStream tis = getStream("test-documents/testRFC822-multipart")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(tis, mockHandler, new Metadata(), context);
        }
        verify(mockHandler).startDocument();
        int divExpectedTimes = 4;
        int paragraphExpectedTimes = 5;
        // TIKA-1422. TesseractOCRParser interferes with the
        // number of times the handler is invoked. But, different versions of Tesseract lead
        // to a different number of invocations. So, we
        // only verify the handler if Tesseract cannot run.
        if (!TesseractOCRParserTest.canRun()) {
            verify(mockHandler, times(divExpectedTimes))
                    .startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"),
                            any(Attributes.class));
            verify(mockHandler, times(divExpectedTimes))
                    .endElement(XHTMLContentHandler.XHTML, "div", "div");
        }
        verify(mockHandler, times(paragraphExpectedTimes))
                .startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"),
                        any(Attributes.class));
        verify(mockHandler, times(paragraphExpectedTimes))
                .endElement(XHTMLContentHandler.XHTML, "p", "p");
        verify(mockHandler).endDocument();

        //repeat, this time looking at content
        ContentHandler bodyHandler = new BodyContentHandler();
        try (TikaInputStream tis = getStream("test-documents/testRFC822-multipart")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(tis, bodyHandler, new Metadata(), context);
        }
        //both alternatives are extracted; the base64 attachment must not leak into the text
        String bodyText = bodyHandler.toString();
        assertTrue(bodyText.contains("body 1"));
        assertTrue(bodyText.contains("body 2"));
        assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
    }

    /**
     * Quoted-printable bodies are decoded, including soft line breaks and non-ASCII text.
     */
    @Test
    public void testQuotedPrintable() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        try (TikaInputStream tis = getStream("test-documents/testRFC822_quoted")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(tis, handler, metadata, context);
        }
        //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
        String bodyText = handler.toString();
        assertTrue(bodyText.contains("D\u00FCsseldorf has non-ascii."));
        assertTrue(bodyText.contains("Lines can be split like this."));
        assertTrue(bodyText.contains("Spaces at the end of a line \r\nmust be encoded.\r\n"));
        assertFalse(bodyText.contains("=")); //there should be no escape sequences
    }

    /**
     * Base64 bodies are decoded, including ISO-8859-1 bytes into Unicode.
     */
    @Test
    public void testBase64() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        context.set(Parser.class, EXTRACT_ALL_ALTERNATIVES_PARSER);
        //need to pass in hint. Autodetects text/plain
        metadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
        try (TikaInputStream tis = getStream("test-documents/testRFC822_base64")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(tis, handler, metadata, context);
        }
        //tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode
        assertContains("Here is some text, with international characters, voil\u00E0!",
                handler.toString());
    }

    /**
     * Internationalized headers, both quoted-printable (Q) and Base64 (B), are decoded.
     */
    @Test
    public void testI18NHeaders() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = mock(DefaultHandler.class);
        try (TikaInputStream tis = getStream("test-documents/testRFC822_i18nheaders")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(tis, handler, metadata, new ParseContext());
        }
        assertEquals("Keld J\u00F8rn Simonsen <keld@dkuug.dk>",
                metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("If you can read this you understand the example.",
                metadata.get(TikaCoreProperties.TITLE));
        assertEquals("If you can read this you understand the example.",
                metadata.get(TikaCoreProperties.SUBJECT));
    }

    /**
     * The from isn't in the usual form.
     * See TIKA-618
     */
    @Test
    public void testUnusualFromAddress() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = mock(DefaultHandler.class);
        try (TikaInputStream tis = getStream("test-documents/testRFC822_oddfrom")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(tis, handler, metadata, new ParseContext());
        }
        assertEquals("Saved by Windows Internet Explorer 7",
                metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Air Permit Programs | Air & Radiation | US EPA",
                metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Air Permit Programs | Air & Radiation | US EPA",
                metadata.get(TikaCoreProperties.SUBJECT));
    }

    /**
     * Test for TIKA-640, increase header max beyond 10k bytes
     */
    @Test
    public void testLongHeader() throws Exception {
        StringBuilder inputBuilder = new StringBuilder();
        for (int i = 0; i < 2000; ++i) {
            inputBuilder.append( //len > 50
                    "really really really really really really long name ");
        }
        String name = inputBuilder.toString();
        byte[] data = ("Status: 520\r\nFrom: " + name + "\r\n\r\n").getBytes(US_ASCII);

        ContentHandler handler = new DefaultHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        // First pass: no MimeConfig in the context, so the oversized header must be rejected.
        try (TikaInputStream tis = TikaInputStream.get(data)) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(tis, handler, metadata, context);
            fail("Expected a TikaException when no MimeConfig is supplied");
        } catch (TikaException expected) {
            // intentionally ignored: this is the outcome the first pass is checking for
        }

        // Second pass: lift the header and line length limits and parse again.
        MimeConfig config = new MimeConfig.Builder().setMaxHeaderLen(-1).setMaxLineLen(-1).build();
        context.set(MimeConfig.class, config);
        try (TikaInputStream tis = TikaInputStream.get(data)) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(tis, handler, metadata, context);
        }
        assertEquals(name.trim(), metadata.get(TikaCoreProperties.CREATOR));
    }

    /**
     * Test for TIKA-678 - not all headers may be present
     */
    @Test
    public void testSomeMissingHeaders() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        context.set(Parser.class, EXTRACT_ALL_ALTERNATIVES_PARSER);
        try (TikaInputStream tis = getStream("test-documents/testRFC822-limitedheaders")) {
            EXTRACT_ALL_ALTERNATIVES_PARSER.parse(tis, handler, metadata, context);
        }

        assertTrue(metadata.isMultiValued(TikaCoreProperties.CREATOR));
        assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]);
        assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]);
        assertTrue(metadata.isMultiValued(Metadata.MESSAGE_FROM));
        assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]);
        assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]);
        assertTrue(metadata.isMultiValued(Metadata.MESSAGE_TO));
        assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]);
        assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]);
        assertEquals("abcd", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("abcd", metadata.get(TikaCoreProperties.SUBJECT));
        assertContains("bar biz bat", handler.toString());
    }

    /**
     * TIKA-1222 When requested, ensure that the various attachments of
     * the mail come through properly as embedded resources
     */
    @Test
    public void testGetAttachmentsAsEmbeddedResources() throws Exception {
        TrackingHandler tracker = new TrackingHandler();
        ParseContext context = new ParseContext();
        ContainerExtractor ex = new ParserContainerExtractor(EXTRACT_ALL_ALTERNATIVES_PARSER,
                ((AutoDetectParser) EXTRACT_ALL_ALTERNATIVES_PARSER).getDetector());
        // getStream already returns a TikaInputStream, so no extra wrapping is needed
        try (TikaInputStream tis = getStream("test-documents/testRFC822-multipart")) {
            assertTrue(ex.isSupported(tis, context));
            ex.extract(tis, ex, tracker, context);
        }

        // Check we found all 3 parts
        assertEquals(3, tracker.filenames.size());
        assertEquals(3, tracker.mediaTypes.size());
        // No filenames available
        assertEquals(null, tracker.filenames.get(0));
        assertEquals(null, tracker.filenames.get(1));
        // Except for this using Content-Disposition filename field
        assertEquals("logo.gif", tracker.filenames.get(2));
        // Types are available
        assertEquals(MediaType.TEXT_PLAIN, tracker.mediaTypes.get(0));
        assertEquals(MediaType.TEXT_HTML, tracker.mediaTypes.get(1));
        assertEquals(MediaType.image("gif"), tracker.mediaTypes.get(2));
    }

    /**
     * Both fixtures are reported with content type {@code message/rfc822}.
     */
    @Test
    public void testDetection() throws Exception {
        //test simple text file
        XMLResult r = getXML("testRFC822_date_utf8");
        assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE));

        //test without extension
        r = getXML("testRFC822_eml");
        assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE));
    }

    /**
     * Creation dates are extracted from both fixtures.
     */
    @Test
    public void testDates() throws Exception {
        //tests non-standard dates that mime4j can't parse
        XMLResult r = getXML("testRFC822_date_utf8");
        assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));

        r = getXML("testRFC822_eml");
        assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));
    }

    /**
     * When a message carries two Subject headers, the single-valued title lookup
     * returns the second one.
     */
    @Test
    public void testMultipleSubjects() throws Exception {
        //adapted from govdocs1 303710.txt
        String s = "From: Shawn Jones [chiroshawn@yahoo.com]\n" + "Subject: 2006N-3502\n" +
                "Subject: I Urge You to Require Notice of Mercury";
        Parser p = new RFC822Parser();
        Metadata m = new Metadata();
        try (TikaInputStream tis = TikaInputStream.get(s.getBytes(StandardCharsets.UTF_8))) {
            p.parse(tis, new DefaultHandler(), m, new ParseContext());
        }
        assertEquals("I Urge You to Require Notice of Mercury", m.get(TikaCoreProperties.TITLE));
    }

    /**
     * Attachments of an email with a PNG attachment show up as separate metadata
     * entries with content type, disposition and embedded path.
     */
    @Test
    public void testExtractAttachments() throws Exception {
        List<Metadata> metadataList =
                getRecursiveMetadata("testEmailWithPNGAtt.eml", EXTRACT_ALL_ALTERNATIVES_PARSER);

        // Check the count first so a short list fails with a clear message
        // instead of an IndexOutOfBoundsException below
        assertEquals(4, metadataList.size());

        // Check we get the metadata
        assertEquals("Tika Test <XXXX@apache.org>", metadataList.get(3).get(Metadata.MESSAGE_FROM));
        assertEquals("Test Attachment Email", metadataList.get(3).get(TikaCoreProperties.TITLE));

        // Check attachments
        assertEquals("text/plain; charset=UTF-8", metadataList.get(1).get(Metadata.CONTENT_TYPE));
        assertEquals("image/png", metadataList.get(2).get(Metadata.CONTENT_TYPE));
        assertEquals("testPNG.png", metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
        assertContains("This email has a PNG attachment included in it",
                metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
        assertEquals(null, metadataList.get(1).get(Metadata.CONTENT_DISPOSITION));
        assertEquals("attachment; filename=\"testPNG.png\"",
                metadataList.get(2).get(Metadata.CONTENT_DISPOSITION));
        assertEquals("/Test Attachment Email.eml/embedded-1",
                metadataList.get(1).get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH));
        assertEquals("/Test Attachment Email.eml/testPNG.png",
                metadataList.get(2).get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH));
    }

    /**
     * Content type, encoding and disposition are recorded for each embedded part
     * of the multipart fixture.
     */
    @Test
    public void testEmbeddedMetadata() throws Exception {
        List<Metadata> seenMetadata =
                getRecursiveMetadata("testRFC822-multipart", EXTRACT_ALL_ALTERNATIVES_PARSER);
        assertEquals(4, seenMetadata.size());

        assertEquals(null, seenMetadata.get(1).get(Metadata.CONTENT_DISPOSITION));
        assertEquals("text/plain; charset=UTF-8", seenMetadata.get(1).get(Metadata.CONTENT_TYPE));
        assertEquals("UTF-8", seenMetadata.get(1).get(Metadata.CONTENT_ENCODING));

        assertEquals(null, seenMetadata.get(2).get(Metadata.CONTENT_DISPOSITION));
        assertEquals("text/html; charset=UTF-8", seenMetadata.get(2).get(Metadata.CONTENT_TYPE));
        assertEquals("UTF-8", seenMetadata.get(2).get(Metadata.CONTENT_ENCODING));

        assertEquals("attachment; filename=\"logo.gif\"",
                seenMetadata.get(3).get(Metadata.CONTENT_DISPOSITION));
        assertEquals("logo.gif", seenMetadata.get(3).get(TikaCoreProperties.RESOURCE_NAME_KEY));
        assertEquals("image/gif", seenMetadata.get(3).get(Metadata.CONTENT_TYPE));
    }

    /**
     * Multipart subtype and boundary are recorded on the root and on each part.
     */
    @Test
    public void testMultipartFlags() throws Exception {
        List<Metadata> metadataList =
                getRecursiveMetadata("testRFC822-multipart", EXTRACT_ALL_ALTERNATIVES_PARSER);

        // Check the root metadata.
        assertEquals("mixed", metadataList.get(0).get(Message.MULTIPART_SUBTYPE));
        assertEquals("0016e64606800312ee04913db790",
                metadataList.get(0).get(Message.MULTIPART_BOUNDARY));

        // Check the metadata of the first alternative.
        assertEquals("text/plain; charset=UTF-8",
                metadataList.get(1).get(Metadata.CONTENT_TYPE));
        assertEquals("alternative", metadataList.get(1).get(Message.MULTIPART_SUBTYPE));
        assertEquals("0016e64606800312ea04913db78e",
                metadataList.get(1).get(Message.MULTIPART_BOUNDARY));

        // Check the metadata of the second alternative.
        assertEquals("text/html; charset=UTF-8",
                metadataList.get(2).get(Metadata.CONTENT_TYPE));
        assertEquals("alternative", metadataList.get(2).get(Message.MULTIPART_SUBTYPE));
        assertEquals("0016e64606800312ea04913db78e",
                metadataList.get(2).get(Message.MULTIPART_BOUNDARY));

        // Check the metadata of the attached GIF.
        assertEquals("image/gif", metadataList.get(3).get(Metadata.CONTENT_TYPE));
        assertEquals("mixed", metadataList.get(3).get(Message.MULTIPART_SUBTYPE));
        assertEquals("0016e64606800312ee04913db790",
                metadataList.get(3).get(Message.MULTIPART_BOUNDARY));
    }

    /**
     * Without the extract-all-alternatives parser, only one alternative ("body 2")
     * ends up in the main body and the gif is the only embedded entry.
     */
    @Test
    public void testBasicAlternativeBodyHandling() throws Exception {
        /*
        multi-part/mixed
            multi-part/alternative
                text
                html
            gif
         */
        List<Metadata> metadataList = getRecursiveMetadata("testRFC822-multipart");
        assertEquals(2, metadataList.size());

        String body = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
        assertContains("body 2", body);
        assertNotContained("body 1", body);
        assertEquals("message/rfc822", metadataList.get(0).get(Metadata.CONTENT_TYPE));

        assertEquals("image/gif", metadataList.get(1).get(Metadata.CONTENT_TYPE));
        assertEquals("/logo.gif",
                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
    }

    /**
     * Inline and attached images in a mixed/related/alternative message are reported
     * with the matching embedded resource type.
     */
    @Test
    public void testMixedRelatedMultipart() throws Exception {
        /*
        multipart/mixed (..6)
            multipart/related (..5)
                multipart/alternative (..4)
                    text/plain
                    text/html
                image/jpeg (inline) Mary with cooler.jpeg (..5)
            image/jpeg (attachment) mary-coffee.jpg (..6)
         */
        List<Metadata> metadataList = getRecursiveMetadata("testRFC822-mixed-simple");
        assertEquals(3, metadataList.size());

        assertContains("body 2", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
        assertNotContained("body 1", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
        assertEquals("message/rfc822", metadataList.get(0).get(Metadata.CONTENT_TYPE));

        assertEquals("image/jpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE));
        assertEquals("/Mary with cooler.jpeg",
                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
        assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString(),
                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));

        assertEquals("image/jpeg", metadataList.get(2).get(Metadata.CONTENT_TYPE));
        assertEquals("/mary-coffee.jpg",
                metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
        assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString(),
                metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
    }

    /**
     * Both html body chunks surrounding an inline pdf end up in the main body text.
     */
    @Test
    public void testAlternativeWithComplexMixedChild() throws Exception {
        /*
        This tests that both html body chunks are stitched back
        together in the body text for the main email.
        multi-part/alternative
            text
            multipart/mixed
                html body chunk 1
                pdf
                html body chunk 2
         */
        List<Metadata> metadataList = getRecursiveMetadata("testRFC822-mixed-with-pdf-inline");
        assertEquals(2, metadataList.size());

        String body = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
        assertContains("body 2", body);
        assertContains("body 3", body);
        assertNotContained("body 1", body);
        assertEquals("message/rfc822", metadataList.get(0).get(Metadata.CONTENT_TYPE));

        assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE));
        assertEquals("/tzora-titan-4-hummer-xl-manual.pdf",
                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
    }

    /**
     * An email with ARC-* headers is reported as {@code message/rfc822} and its date is parsed.
     */
    @Test
    public void testArc() throws Exception {
        /*
        This tests an email with ARC-* headers but that does not begin
        with one, and was detected as HTML
         */
        List<Metadata> metadataList = getRecursiveMetadata("testRFC822-ARC");
        assertEquals(1, metadataList.size());
        assertEquals("message/rfc822", metadataList.get(0).get(Metadata.CONTENT_TYPE));
        //Also, test that this date has been parsed: Wed, 26 Jan 2022 09:14:37 +0100 (CET)
        assertTrue(metadataList.get(0).get(TikaCoreProperties.CREATED).startsWith("2022-01-"));
    }

    /**
     * A simple message yields a single metadata entry whose content holds the body text.
     */
    @Test
    public void testSimpleBodyInlined() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testRFC822_simple_inline_body.txt");
        assertEquals(1, metadataList.size());
        assertContains("asked", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
    }

    /**
     * The GroupWise fixture yields the main body plus two attachments with
     * the expected paths and resource types.
     */
    @Test
    public void testGroupwise() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testGroupWiseEml.eml");
        assertEquals(3, metadataList.size());

        assertContains("test<", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));

        assertContains("test2", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
        assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString(),
                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
        assertEquals("/test.eml",
                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));

        assertContains("ssssss", metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT));
        assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString(),
                metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
        assertEquals("/Neues Textdokument.txt",
                metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
    }

    /**
     * Text attachments are kept out of the main body and reported as separate
     * attachments with their own created/modified dates.
     */
    @Test
    public void testMultipartTextAttachment() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testRFC822_multipart_attachments.eml");
        assertEquals(3, metadataList.size());

        assertContains("This is the html body of the main msg.",
                metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));

        assertContains("This is Test TXTA File for parser",
                metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
        assertNotContained("This is Test TXTA File for parser",
                metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
        assertEquals("/Test TxtA.txt",
                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
        assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString(),
                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
        //make sure we extracted creation and modified dates
        assertTrue(metadataList.get(1).get(TikaCoreProperties.CREATED).startsWith("2022-11-"));
        assertTrue(metadataList.get(1).get(TikaCoreProperties.MODIFIED).startsWith("2022-11-"));

        assertContains("This is Test TXTB File for parser",
                metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT));
        assertNotContained("This is Test TXTB File for parser",
                metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
        assertEquals("/Test TxtB.txt",
                metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
        assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString(),
                metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
        //make sure we extracted creation and modified dates
        assertTrue(metadataList.get(2).get(TikaCoreProperties.CREATED).startsWith("2022-11-"));
        assertTrue(metadataList.get(2).get(TikaCoreProperties.MODIFIED).startsWith("2022-11-"));
    }
}