TestContainerAwareDetector.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Random;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.Tika;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.microsoft.ooxml.OPCPackageDetector;
import org.apache.tika.detect.zip.DefaultZipContainerDetector;
import org.apache.tika.detect.zip.OpenDocumentDetector;
import org.apache.tika.detect.zip.StreamingZipContainerDetector;
import org.apache.tika.detect.zip.ZipContainerDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
import org.apache.tika.parser.iwork.iwana.IWork18PackageParser;
import org.apache.tika.utils.XMLReaderUtils;
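/**
* JUnit test class for container-aware type detection. It started out covering
* {@link org.apache.tika.detect.microsoft.POIFSContainerDetector} (OLE2), and also
* exercises the zip-based container detectors and {@link DefaultDetector} as a whole.
*/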
public class TestContainerAwareDetector extends MultiThreadedTikaTest {
// MimeTypes instance obtained from TikaLoader; also called directly for the "magic" checks.
private final MimeTypes mimeTypes = TikaLoader.getMimeTypes();
// Registry used to test whether an expected type is a specialization of application/zip.
private final MediaTypeRegistry mediaTypeRegistry = mimeTypes.getMediaTypeRegistry();
// Detector used by most assertions in this class, built from the MimeTypes above.
private final Detector detector = new DefaultDetector(mimeTypes);
// Checked in addition to the main detector whenever the expected type is zip-based.
private final StreamingZipContainerDetector streamingZipDetector =
new StreamingZipContainerDetector();
/**
 * Puts the XML reader pool size back to 10 after every test.
 *
 * @throws TikaException if {@link XMLReaderUtils#setPoolSize(int)} fails
 */
@AfterEach
public void tearDown() throws TikaException {
    // the multi-threaded tests resize the pool, so reset it for whichever test runs next
    final int poolSizeAfterTest = 10;
    XMLReaderUtils.setPoolSize(poolSizeAfterTest);
}
/**
 * Asserts the detected type of a test document when no resource name is supplied.
 *
 * @param file name of the document under /test-documents/
 * @param type expected media type
 * @throws Exception if the document cannot be read or an assertion fails
 */
private void assertTypeByData(String file, String type) throws Exception {
    final String noResourceName = null;
    assertTypeByNameAndData(file, noResourceName, type);
}
/**
 * Asserts the detected type of a test document, supplying the document's own
 * file name as the resource name.
 *
 * @param file name of the document under /test-documents/
 * @param type expected media type
 * @throws Exception if the document cannot be read or an assertion fails
 */
private void assertTypeByNameAndData(String file, String type) throws Exception {
    final String resourceName = file;
    assertTypeByNameAndData(file, resourceName, type);
}
/**
 * Runs both flavours of the check on one document: first without a resource
 * name, then with the document's own file name as the resource name.
 *
 * @param file          name of the document under /test-documents/
 * @param byData        expected media type when no resource name is supplied
 * @param byNameAndData expected media type when the file name is supplied
 * @throws Exception if the document cannot be read or an assertion fails
 */
private void assertType(String file, String byData, String byNameAndData) throws Exception {
    final String noResourceName = null;
    // data only
    assertTypeByNameAndData(file, noResourceName, byData);
    // file name + data
    assertTypeByNameAndData(file, file, byNameAndData);
}
/**
 * Asserts the detected type of a test document for an explicit (possibly
 * {@code null}) resource name, without the additional mime-magic check.
 *
 * @param dataFile name of the document under /test-documents/
 * @param name     resource name to put into the metadata, or {@code null} for none
 * @param type     expected media type
 * @throws Exception if the document cannot be read or an assertion fails
 */
private void assertTypeByNameAndData(String dataFile, String name, String type)
        throws Exception {
    final String noMagicExpectation = null;
    assertTypeByNameAndData(dataFile, name, type, noMagicExpectation);
}
/**
 * Core assertion helper. Opens the test document and checks, in this order:
 * <ol>
 * <li>if {@code typeFromMagic} is given, the type reported by {@code mimeTypes};</li>
 * <li>the type reported by {@code detector};</li>
 * <li>if the expected type is a specialization of application/zip and is not the
 * protected-OOXML type, the type reported by {@code streamingZipDetector}.</li>
 * </ol>
 * All three use the same stream and the same metadata.
 *
 * @param dataFile         name of the document under /test-documents/
 * @param name             resource name to add to the metadata, or {@code null} for none
 * @param typeFromDetector media type expected from the detector(s)
 * @param typeFromMagic    media type expected from {@code mimeTypes}, or {@code null}
 *                         to skip that check
 * @throws Exception if the document cannot be read or an assertion fails
 */
private void assertTypeByNameAndData(String dataFile, String name, String typeFromDetector,
                                     String typeFromMagic) throws Exception {
    final String resource = "/test-documents/" + dataFile;
    try (TikaInputStream stream = TikaInputStream.get(getResourceAsUrl(resource))) {
        Metadata metadata = new Metadata();
        if (name != null) {
            metadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, name);
        }

        // Mime Magic version is likely to be less precise
        if (typeFromMagic != null) {
            MediaType byMagic = mimeTypes.detect(stream, metadata, new ParseContext());
            assertEquals(MediaType.parse(typeFromMagic), byMagic);
        }

        // All being well, the detector should get it perfect
        MediaType expected = MediaType.parse(typeFromDetector);
        assertEquals(expected, detector.detect(stream, metadata, new ParseContext()));

        // the streaming zip detector is only consulted for zip-based, non-protected types
        if (!mediaTypeRegistry.isSpecializationOf(expected, MediaType.APPLICATION_ZIP)) {
            return;
        }
        if (expected.toString().contains("tika-ooxml-protected")) {
            return;
        }
        assertEquals(expected, streamingZipDetector.detect(stream, metadata, new ParseContext()),
                "streaming zip detector failed");
    }
}
/**
 * Checks detection of OLE2-based documents: from the data alone, with the
 * matching file name, and with deliberately wrong file names.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testDetectOLE2() throws Exception {
    // NOTE(review): the data-only checks for the Microsoft Office types known by POI
    // are commented out in this test -- confirm whether that is intentional.
    // Kept verbatim for reference:
    //   assertTypeByData("testEXCEL.xls", "application/vnd.ms-excel");
    //   assertTypeByData("testWORD.doc", "application/msword");
    //   assertTypeByData("testPPT.ppt", "application/vnd.ms-powerpoint");
    //   assertTypeByData("test-outlook.msg", "application/vnd.ms-outlook");
    //   assertTypeByData("test-outlook2003.msg", "application/vnd.ms-outlook");
    //   assertTypeByData("testVISIO.vsd", "application/vnd.visio");

    // {document, expected type} -- no resource name supplied
    final String[][] byDataOnly = {
            {"testPUBLISHER.pub", "application/x-mspublisher"},
            {"testWORKS.wps", "application/vnd.ms-works"},
            {"testWORKS2000.wps", "application/vnd.ms-works"},
            // The two older Works Word Processor samples were created with Works Word
            // Processor 7.0 (hence the text inside) and exported to the older formats
            // with the "Save As" feature.
            {"testWORKSWordProcessor3.0.wps", "application/vnd.ms-works"},
            {"testWORKSWordProcessor4.0.wps", "application/vnd.ms-works"},
            {"testWORKSSpreadsheet7.0.xlr", "application/x-tika-msworks-spreadsheet"},
            {"testPROJECT2003.mpp", "application/vnd.ms-project"},
            {"testPROJECT2007.mpp", "application/vnd.ms-project"},
            // Excel95 can be detected but not parsed
            {"testEXCEL_95.xls", "application/vnd.ms-excel"},
            // Some that POI doesn't handle, but that are still OLE2 based
            {"testCOREL.shw", "application/x-corelpresentations"},
            {"testQUATTRO.qpw", "application/x-quattro-pro; version=9"},
            {"testQUATTRO.wb3", "application/x-quattro-pro; version=7-8"},
            {"testHWP_5.0.hwp", "application/x-hwp-v5"}
    };
    for (String[] row : byDataOnly) {
        assertTypeByData(row[0], row[1]);
    }

    // {document, resource name, expected type}
    final String[][] byNameAndData = {
            // With the filename and data
            {"testEXCEL.xls", "testEXCEL.xls", "application/vnd.ms-excel"},
            {"testWORD.doc", "testWORD.doc", "application/msword"},
            {"testPPT.ppt", "testPPT.ppt", "application/vnd.ms-powerpoint"},
            // With the wrong filename supplied, data will trump filename
            {"testEXCEL.xls", "notWord.doc", "application/vnd.ms-excel"},
            {"testWORD.doc", "notExcel.xls", "application/msword"},
            {"testPPT.ppt", "notWord.doc", "application/vnd.ms-powerpoint"},
            // With a filename of a totally different type, data will trump filename
            {"testEXCEL.xls", "notPDF.pdf", "application/vnd.ms-excel"},
            {"testEXCEL.xls", "notPNG.png", "application/vnd.ms-excel"}
    };
    for (String[] row : byNameAndData) {
        assertTypeByNameAndData(row[0], row[1], row[2]);
    }

    assertTypeByData("testDGN8.dgn", "image/vnd.dgn; version=8");
}
/**
 * Checks detection of an HWPX document, with and without its file name.
 * Disabled because the sample file is not in the repository.
 *
 * @throws Exception if the document cannot be read or an assertion fails
 */
@Test
@Disabled("until we have an actual hwpx example file in our repo")
public void testHWPX() throws Exception {
    final String sample = "testHWPX.hwpx";
    final String hwpx = "application/hwp+zip";
    assertTypeByData(sample, hwpx);
    assertTypeByNameAndData(sample, hwpx);
}
/**
 * Checks detection of StarOffice / OpenOffice documents and templates.
 * <p>
 * There is no way to distinguish "proper" StarOffice files from templates.
 * All templates share the same extension, but their actual type depends on
 * the magic, and the current MimeTypes class doesn't allow the same glob
 * pattern to be used in more than one mimetype.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testDetectStarOfficeFiles() throws Exception {
    // {document, expected type}; the same type is expected with and without the file name
    final String[][] starOffice5AndWriter6 = {
            {"testStarOffice-5.2-calc.sdc", "application/vnd.stardivision.calc"},
            {"testVORCalcTemplate.vor", "application/vnd.stardivision.calc"},
            {"testStarOffice-5.2-draw.sda", "application/vnd.stardivision.draw"},
            {"testVORDrawTemplate.vor", "application/vnd.stardivision.draw"},
            {"testStarOffice-5.2-impress.sdd", "application/vnd.stardivision.impress"},
            {"testVORImpressTemplate.vor", "application/vnd.stardivision.impress"},
            {"testStarOffice-5.2-writer.sdw", "application/vnd.stardivision.writer"},
            {"testVORWriterTemplate.vor", "application/vnd.stardivision.writer"},
            // file from open office bug tracker issue #6452 -- star office >6.0
            {"testStarOffice-6.0-writer.sxw", "application/vnd.sun.xml.writer"}
    };
    for (String[] row : starOffice5AndWriter6) {
        assertType(row[0], row[1], row[1]);
    }

    // ooo bug #5116
    // can't find a diff in contents btwn sxw and stw...need to rely on file extension
    // NOTE(review): the second argument of this overload is the resource name, yet a
    // media type string is passed here -- confirm whether the .stw file name was intended.
    assertTypeByNameAndData("testStarOffice-6.0-writer-template.stw",
            "application/vnd.sun.xml.writer.template", "application/vnd.sun.xml.writer",
            "application/zip");

    final String[][] starOffice6AndOpenOffice = {
            // ooo bug #1151
            {"testStarOffice-6.0-calc.sxc", "application/vnd.sun.xml.calc"},
            // ooo bug #261
            {"testStarOffice-6.0-draw.sxd", "application/vnd.sun.xml.draw"},
            // ooo bug #5336
            {"testStarOffice-6.0-draw.sxi", "application/vnd.sun.xml.impress"},
            // ooo bug #67431 -- had to manually fix the name spacing in the manifest.xml
            {"testOpenOffice-autotext.bau", "application/vnd.openofficeorg.autotext"},
            // ooo bug #110760
            {"testOpenOffice-extension.oxt", "application/vnd.openofficeorg.extension"}
    };
    for (String[] row : starOffice6AndOpenOffice) {
        assertType(row[0], row[1], row[1]);
    }
}
/**
 * Checks that the stream has no open container before detection and holds a
 * {@link POIFSFileSystem} after a PowerPoint file has been detected.
 *
 * @throws Exception if the document cannot be read or an assertion fails
 */
@Test
public void testOpenContainer() throws Exception {
    final String resource = "/test-documents/testPPT.ppt";
    try (TikaInputStream stream = TikaInputStream.get(getResourceAsUrl(resource))) {
        // nothing has looked inside the stream yet
        assertNull(stream.getOpenContainer());

        MediaType detected = detector.detect(stream, new Metadata(), new ParseContext());
        assertEquals(MediaType.parse("application/vnd.ms-powerpoint"), detected);

        Object container = stream.getOpenContainer();
        assertTrue(container instanceof POIFSFileSystem);
    }
}
/**
 * EPub stores a mimetype entry inside the parent zip file, in a similar way
 * to OpenDocument; checks EPUB and iBooks samples from the data alone.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testDetectEPub() throws Exception {
    final String[][] samples = {
            {"testEPUB.epub", "application/epub+zip"},
            {"testiBooks.ibooks", "application/x-ibooks+zip"}
    };
    for (String[] sample : samples) {
        assertTypeByData(sample[0], sample[1]);
    }
}
/**
 * Checks that a Lotus Notes .eml sample is detected as message/rfc822.
 *
 * @throws Exception if the document cannot be read or an assertion fails
 */
@Test
public void testDetectLotusNotesEml() throws Exception {
    // Lotus .eml files aren't guaranteed to have any of the magic matches as
    // the first line, but should have X-Notes-Item and Message-ID
    final String lotusEml = "testLotusEml.eml";
    assertTypeByData(lotusEml, "message/rfc822");
}
/**
 * Checks OpenDocument detection from the data alone, including a file that
 * carries an .odt name but is expected to be detected as plain text.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testDetectODF() throws Exception {
    final String[][] samples = {
            {"testODFwithOOo3.odt", "application/vnd.oasis.opendocument.text"},
            {"testOpenOffice2.odf", "application/vnd.oasis.opendocument.formula"},
            {"testODTnotaZipFile.odt", "text/plain"}
    };
    for (String[] sample : samples) {
        assertTypeByData(sample[0], sample[1]);
    }
}
/**
 * Checks detection of a 3MF sample, with and without its file name.
 *
 * @throws Exception if the document cannot be read or an assertion fails
 */
@Test
public void test3MF() throws Exception {
    final String sample = "test3mf.3mf";
    final String threeMf = "application/vnd.ms-package.3dmanufacturing-3dmodel+xml";
    assertTypeByData(sample, threeMf);
    assertTypeByNameAndData(sample, threeMf);
}
/**
 * TIKA-3356: with the OPC package detector registered ahead of the OpenDocument
 * detector, an ODT file must still be detected as OpenDocument text, and the
 * stream must end up holding a commons-compress ZipFile as its open container.
 *
 * @throws Exception if the document cannot be read or an assertion fails
 */
@Test
public void testODFDifferentOrder() throws Exception {
    List<ZipContainerDetector> ordered = new ArrayList<>();
    ordered.add(new OPCPackageDetector());
    ordered.add(new OpenDocumentDetector());
    DefaultZipContainerDetector zipDetector = new DefaultZipContainerDetector(ordered);

    try (TikaInputStream stream = TikaInputStream.get(
            getResourceAsStream("/test-documents/testODFwithOOo3.odt"))) {
        // force underlying file to test the proper behavior with the underlying zipfile
        stream.getFile();

        MediaType detected = zipDetector.detect(stream, new Metadata(), new ParseContext());
        assertEquals("application/vnd.oasis.opendocument.text", detected.toString());

        Object container = stream.getOpenContainer();
        assertNotNull(container);
        assertEquals("org.apache.commons.compress.archivers.zip.ZipFile",
                container.getClass().getName());
    }
}
/**
 * Checks detection of OOXML documents: from the data alone, with the matching
 * file name, and with deliberately wrong file names.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testDetectOOXML() throws Exception {
    final String xlsx =
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    final String docx =
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    final String pptx =
            "application/vnd.openxmlformats-officedocument.presentationml.presentation";

    // {document, expected type} -- no resource name supplied
    final String[][] byDataOnly = {
            {"testEXCEL.xlsx", xlsx},
            {"testWORD.docx", docx},
            {"testPPT.pptx", pptx},
            // Some of the less common OOXML types
            {"testPPT.pptm", "application/vnd.ms-powerpoint.presentation.macroenabled.12"},
            {"testPPT.ppsx",
                    "application/vnd.openxmlformats-officedocument.presentationml.slideshow"},
            {"testPPT.ppsm", "application/vnd.ms-powerpoint.slideshow.macroEnabled.12"},
            {"testDOTM.dotm", "application/vnd.ms-word.template.macroEnabled.12"},
            {"testEXCEL.strict.xlsx", xlsx},
            {"testEXCEL_macro_enabled_template.xltm",
                    "application/vnd.ms-excel.template.macroenabled.12"},
            {"testEXCEL_template.xltx",
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.template"},
            {"testPPT.xps", "application/vnd.ms-xpsdocument"},
            {"testVISIO.vsdm", "application/vnd.ms-visio.drawing.macroenabled.12"},
            {"testVISIO.vsdx", "application/vnd.ms-visio.drawing"},
            {"testVISIO.vssm", "application/vnd.ms-visio.stencil.macroenabled.12"},
            {"testVISIO.vssx", "application/vnd.ms-visio.stencil"},
            {"testVISIO.vstm", "application/vnd.ms-visio.template.macroenabled.12"},
            {"testVISIO.vstx", "application/vnd.ms-visio.template"},
            // .xlsb is an OOXML file containing the binary parts, and not
            // an OLE2 file as you might initially expect!
            {"testEXCEL.xlsb", "application/vnd.ms-excel.sheet.binary.macroEnabled.12"}
    };
    for (String[] row : byDataOnly) {
        assertTypeByData(row[0], row[1]);
    }

    // {document, resource name, expected type}
    final String[][] byNameAndData = {
            // With the filename and data
            {"testEXCEL.xlsx", "testEXCEL.xlsx", xlsx},
            {"testWORD.docx", "testWORD.docx", docx},
            {"testPPT.pptx", "testPPT.pptx", pptx},
            // With the wrong filename supplied, data will trump filename
            {"testEXCEL.xlsx", "notWord.docx", xlsx},
            {"testWORD.docx", "notExcel.xlsx", docx},
            {"testPPT.pptx", "notWord.docx", pptx},
            // With an incorrect filename of a different container type, data trumps filename
            {"testEXCEL.xlsx", "notOldExcel.xls", xlsx}
    };
    for (String[] row : byNameAndData) {
        assertTypeByNameAndData(row[0], row[1], row[2]);
    }
}
/**
 * Password protected OLE2 files have the same structure as regular OLE2 files
 * (although core streams may be encrypted), so they are fairly straightforward
 * to detect. Each sample is checked first without, then with, its file name.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testDetectProtectedOLE2() throws Exception {
    final String[][] samples = {
            {"testEXCEL_protected_passtika.xls", "application/vnd.ms-excel"},
            {"testWORD_protected_passtika.doc", "application/msword"},
            {"testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint"}
    };
    for (String[] sample : samples) {
        assertTypeByData(sample[0], sample[1]);
    }
    for (String[] sample : samples) {
        assertTypeByNameAndData(sample[0], sample[1]);
    }
}
/**
 * Password protected OOXML files are much trickier than regular ones: instead
 * of being ZIP based they are an OLE2 file that carries the OOXML structure
 * inside an encrypted stream, which makes them much harder to detect.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testDetectProtectedOOXML() throws Exception {
    final String protectedOoxml = "application/x-tika-ooxml-protected";
    final String[] samples = {
            "testEXCEL_protected_passtika.xlsx",
            "testWORD_protected_passtika.docx",
            "testPPT_protected_passtika.pptx"
    };

    // Encrypted Microsoft Office OOXML files have OLE magic but special
    // streams, so we can tell they're Protected OOXML
    for (String sample : samples) {
        assertTypeByData(sample, protectedOoxml);
    }

    // At the moment, we can't use the name to specialise
    // See discussions on TIKA-790 for details
    for (String sample : samples) {
        assertTypeByNameAndData(sample, protectedOoxml);
    }
}
/**
 * Verifies that detection leaves no additional Tika temporary files behind
 * once the TikaInputStream has been closed.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testRemovalTempfiles() throws Exception {
    final String[] samples = {"testWORD.docx", "test-documents.zip"};
    for (String sample : samples) {
        assertRemovalTempfiles(sample);
    }
}
/**
 * Counts the entries in the {@code java.io.tmpdir} directory whose name starts
 * with {@code apache-tika-}.
 *
 * @return number of matching entries
 */
private int countTemporaryFiles() {
    //TODO: fix this. This can prevent multiple parallel builds
    //from running at the same time because there can be more than one
    //process writing to apache-tika-*
    File tmpDir = new File(System.getProperty("java.io.tmpdir"));
    File[] tikaTempFiles = tmpDir.listFiles((dir, name) -> name.startsWith("apache-tika-"));
    // listFiles returns null when the directory cannot be listed
    return Objects.requireNonNull(tikaTempFiles).length;
}
/**
 * Runs detection on one test document and asserts that the number of Tika
 * temporary files is the same before opening and after closing the stream.
 *
 * @param fileName name of the document under /test-documents/
 * @throws Exception if the document cannot be read or the assertion fails
 */
private void assertRemovalTempfiles(String fileName) throws Exception {
    final int before = countTemporaryFiles();

    final String resource = "/test-documents/" + fileName;
    try (TikaInputStream stream = TikaInputStream.get(getResourceAsUrl(resource))) {
        // only the side effects matter here, the detected type is ignored
        detector.detect(stream, new Metadata(), new ParseContext());
    }

    final int after = countTemporaryFiles();
    assertEquals(before, after);
}
/**
 * Checks detection of Keynote, Numbers and Pages samples from the data alone.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testDetectIWork() throws Exception {
    final String[][] samples = {
            {"testKeynote.key", "application/vnd.apple.keynote"},
            {"testNumbers.numbers", "application/vnd.apple.numbers"},
            {"testPages.pages", "application/vnd.apple.pages"}
    };
    for (String[] sample : samples) {
        assertTypeByData(sample[0], sample[1]);
    }
}
/**
 * Checks detection of iWork 2013 samples from the data alone. Only Keynote
 * gets a specific type; Numbers and Pages are expected as the UNKNOWN13 type.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testDetectIWork2013() throws Exception {
    final String keynote13 =
            IWork13PackageParser.IWork13DocumentType.KEYNOTE13.getType().toString();
    assertTypeByData("testKeynote2013.key", keynote13);

    // Without decoding the Document snappy stream, we can't tell the
    // difference between these two just based on the zip entries
    final String unknown13 =
            IWork13PackageParser.IWork13DocumentType.UNKNOWN13.getType().toString();
    assertTypeByData("testNumbers2013.numbers", unknown13);
    assertTypeByData("testPages2013.pages", unknown13);
}
/**
 * Checks detection of an iWork 2018 Keynote sample from the data alone.
 *
 * @throws Exception if the document cannot be read or an assertion fails
 */
@Test
public void testDetectIWork2018() throws Exception {
    // file from libre office issue tracker, issue #123573;
    // jpegs were manually removed for the sake of space
    final String keynote18 =
            IWork18PackageParser.IWork18DocumentType.KEYNOTE18.getType().toString();
    assertTypeByData("testKeynote2018.key", keynote18);
    //see https://bugs.documentfoundation.org/show_bug.cgi?id=120709 for a 2018 numbers file
    //see https://bugs.documentfoundation.org/show_bug.cgi?id=120707 for a 2018 pages file
}
/**
 * Checks detection of a KMZ sample from the data alone.
 *
 * @throws Exception if the document cannot be read or an assertion fails
 */
@Test
public void testDetectKMZ() throws Exception {
    final String kmz = "application/vnd.google-earth.kmz";
    assertTypeByData("testKMZ.kmz", kmz);
}
/**
 * Checks detection of an IPA sample, first with its file name, then from the
 * data alone.
 *
 * @throws Exception if the document cannot be read or an assertion fails
 */
@Test
public void testDetectIPA() throws Exception {
    final String sample = "testIPA.ipa";
    final String ipa = "application/x-itunes-ipa";
    assertTypeByNameAndData(sample, ipa);
    assertTypeByData(sample, ipa);
}
/**
 * Checks detection of ASiC-E and ASiC-S samples, first from the data alone,
 * then with their file names.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testASiC() throws Exception {
    final String[][] samples = {
            {"testASiCE.asice", "application/vnd.etsi.asic-e+zip"},
            {"testASiCS.asics", "application/vnd.etsi.asic-s+zip"}
    };
    for (String[] sample : samples) {
        assertTypeByData(sample[0], sample[1]);
    }
    for (String[] sample : samples) {
        assertTypeByNameAndData(sample[0], sample[1]);
    }
}
/**
 * Checks detection of plain zip files and of JAR-based formats.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testDetectZip() throws Exception {
    final String[][] byDataOnly = {
            {"test-documents.zip", "application/zip"},
            {"test-zip-of-zip.zip", "application/zip"},
            // JAR based formats
            {"testJAR.jar", "application/java-archive"},
            {"testWAR.war", "application/x-tika-java-web-archive"},
            {"testEAR.ear", "application/x-tika-java-enterprise-archive"},
            {"testAPK.apk", "application/vnd.android.package-archive"}
    };
    for (String[] row : byDataOnly) {
        assertTypeByData(row[0], row[1]);
    }

    // JAR with HTML files in it: the same type is expected from the mime-magic check too
    final String jarWithHtml = "testJAR_with_HTML.jar";
    final String jar = "application/java-archive";
    assertTypeByNameAndData(jarWithHtml, jarWithHtml, jar, jar);
}
/**
 * Checks that the "no magic" tar sample is detected as application/x-tar from
 * the data alone.
 *
 * @throws Exception if the document cannot be read or an assertion fails
 */
@Test
public void testTarWithNoMagic() throws Exception {
    final String tar = "application/x-tar";
    assertTypeByData("testTAR_no_magic.tar", tar);
}
/**
 * Checks that the {@code testLZMA_oom} sample is detected as application/x-lzma
 * from the data alone.
 *
 * @throws Exception if the document cannot be read or an assertion fails
 */
@Test
public void testLZMAOOM() throws Exception {
    final String lzma = "application/x-lzma";
    assertTypeByData("testLZMA_oom", lzma);
}
/**
 * Checks detection of an ESRI layer file. Disabled because no acceptable
 * sample file is in the repository.
 *
 * @throws Exception if the document cannot be read or an assertion fails
 */
@Test
@Disabled("find acceptable test file")
public void testLyr() throws Exception {
    //file used in development but not added to
    //repo: https://cmgds.marine.usgs.gov/publications/of2005-1346/arcgis/bathy/Bathymetry.lyr
    // NOTE(review): "x-esri-layer" is passed as the resource name -- confirm whether
    // the file name was intended here.
    final String resourceName = "x-esri-layer";
    final String fromDetector = "application/x-esri-layer";
    final String fromMagic = "application/x-tika-msoffice";
    assertTypeByNameAndData("testLyr.lyr", resourceName, fromDetector, fromMagic);
}
/**
 * Checks that the {@code testZ_oom.Z} sample is detected as
 * application/x-compress from the data alone.
 *
 * @throws Exception if the document cannot be read or an assertion fails
 */
@Test
public void testCompressOOM() throws Exception {
    final String compress = "application/x-compress";
    assertTypeByData("testZ_oom.Z", compress);
}
/**
 * Reads exactly the first {@code n} bytes of a test document and wraps them in
 * a TikaInputStream.
 *
 * @param name name of the document under /test-documents/
 * @param n    number of leading bytes to keep
 * @return a TikaInputStream over the first {@code n} bytes
 * @throws IOException if the document ends before {@code n} bytes were read,
 *                     or reading fails
 */
private TikaInputStream getTruncatedFile(String name, int n) throws IOException {
    try (InputStream source = getResourceAsStream("/test-documents/" + name)) {
        final byte[] head = new byte[n];
        // a single read() may return fewer bytes than requested, so keep going
        for (int filled = 0; filled < head.length; ) {
            int count = source.read(head, filled, head.length - filled);
            if (count == -1) {
                throw new IOException("Unexpected end of stream");
            }
            filled += count;
        }
        return TikaInputStream.get(head);
    }
}
/**
 * Checks detection on truncated OOXML and OLE2 files: without a file name only
 * the container type is expected, with a file name the specific type.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testTruncatedFiles() throws Exception {
    // First up a truncated OOXML (zip) file
    // With only the data supplied, the best we can do is the container
    assertEquals(MediaType.application("x-tika-ooxml"),
            detectOnTruncatedResource("testEXCEL.xlsx", 300, null));

    // With truncated data + filename, we can use the filename to specialise
    assertEquals(
            MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
            detectOnTruncatedResource("testEXCEL.xlsx", 300, "testEXCEL.xlsx"));

    // Now a truncated OLE2 file
    assertEquals(MediaType.application("x-tika-msoffice"),
            detectOnTruncatedResource("testEXCEL.xls", 400, null));

    // Finally a truncated OLE2 file, with a filename available
    assertEquals(MediaType.application("vnd.ms-excel"),
            detectOnTruncatedResource("testEXCEL.xls", 400, "testEXCEL.xls"));
}

/**
 * Runs the detector on the first {@code length} bytes of a test document.
 *
 * @param file         name of the document under /test-documents/
 * @param length       number of leading bytes to keep
 * @param resourceName resource name to add to the metadata, or {@code null} for none
 * @return the detected media type
 * @throws Exception if the document cannot be read or detection fails
 */
private MediaType detectOnTruncatedResource(String file, int length, String resourceName)
        throws Exception {
    Metadata metadata = new Metadata();
    if (resourceName != null) {
        metadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, resourceName);
    }
    try (TikaInputStream truncated = getTruncatedFile(file, length)) {
        return detector.detect(truncated, metadata, new ParseContext());
    }
}
/**
 * Runs the inherited {@code testDetector} harness over the test files whose
 * name ends in ".xml", with the XML reader pool sized to the thread count.
 *
 * @throws Exception if the harness reports a failure
 */
@Test
public void testXMLMultiThreaded() throws Exception {
    Detector tikaDetector = new Tika().getDetector();
    FileFilter xmlFilesOnly = new FileFilter() {
        @Override
        public boolean accept(File candidate) {
            return candidate.getName().endsWith(".xml");
        }
    };
    final int threadCount = 1;
    XMLReaderUtils.setPoolSize(threadCount);
    testDetector(tikaDetector, threadCount, 20, xmlFilesOnly, threadCount * 2);
}
/**
 * Runs the inherited {@code testDetector} harness with 20 threads over a
 * random selection of at most 20 test files.
 *
 * @throws Exception if the harness reports a failure
 */
@Test
public void testAllMultithreaded() throws Exception {
    Detector tikaDetector = new Tika().getDetector();

    //TODO: create proper randomized framework that will record seed, etc...
    final Random random = new Random();
    //increase this to the number of files for a true smoke test
    //for now, randomly pick 20 files.
    final int toProcess = 20;
    // single-element array so the lambda can update the running count
    final int[] processed = {0};
    FileFilter randomSample = pathname -> {
        // once the quota is reached the coin is no longer flipped
        if (processed[0] >= toProcess || !random.nextBoolean()) {
            return false;
        }
        processed[0]++;
        return true;
    };

    final int threadCount = 20;
    XMLReaderUtils.setPoolSize(threadCount);
    testDetector(tikaDetector, threadCount, 50, randomSample, threadCount * 3);
}
/**
 * Checks that recursive parsing of a zip containing an OpenOffice file yields
 * three metadata entries, the third being an OpenDocument presentation.
 *
 * @throws Exception if parsing or an assertion fails
 */
@Test
public void testOpenOfficeInAZip() throws Exception {
    List<Metadata> all = getRecursiveMetadata("testOpenOfficeInAZip.zip");
    assertEquals(3, all.size());

    Metadata third = all.get(2);
    assertEquals("application/vnd.oasis.opendocument.presentation",
            third.get(Metadata.CONTENT_TYPE));
}
/**
 * Checks detection of three binary-plist based samples from the data alone.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testBPList() throws Exception {
    final String[][] samples = {
            {"testMemgraph.memgraph", "application/x-bplist-memgraph"},
            {"testWEBARCHIVE.webarchive", "application/x-bplist-webarchive"},
            {"testBPList.bplist", "application/x-bplist-itunes"}
    };
    for (String[] sample : samples) {
        assertTypeByData(sample[0], sample[1]);
    }
}
/**
 * Detects a Word document held in memory with the default-loaded detectors and
 * then checks that the whole content can still be read from the same stream.
 *
 * @throws Exception if the document cannot be read or an assertion fails
 */
@Test
public void testPOIFSContainerDetector() throws Exception {
    UnsynchronizedByteArrayOutputStream buffer =
            UnsynchronizedByteArrayOutputStream.builder().get();
    try (InputStream source = getResourceAsStream("/test-documents/testWORD.doc")) {
        IOUtils.copy(source, buffer);
    }
    final byte[] content = buffer.toByteArray();
    final long expectedLength = content.length;

    //test default
    Detector defaultDetector = TikaLoader.loadDefault().loadDetectors();
    try (TikaInputStream stream = TikaInputStream.get(content)) {
        MediaType detected = defaultDetector.detect(stream, new Metadata(), new ParseContext());
        assertEquals("application/msword", detected.toString());
        // every byte must still be readable after detection
        assertEquals(expectedLength, countBytes(stream));
    }
}
/**
 * Runs detection for several file types through all three TikaInputStream
 * backing strategies: ByteArrayBackedStrategy, FileBackedStrategy and
 * StreamBackedStrategy.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testDetectionAllBackingTypes() throws Exception {
    // {document, expected type}
    final String[][] samples = {
            {"testWORD.doc", "application/msword"},
            {"testEXCEL.xlsx",
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"},
            {"testPPT.pptx",
                    "application/vnd.openxmlformats-officedocument.presentationml.presentation"},
            {"testODFwithOOo3.odt", "application/vnd.oasis.opendocument.text"},
            {"testEPUB.epub", "application/epub+zip"}
    };
    for (String[] sample : samples) {
        assertDetectionAllBackingTypes(sample[0], sample[1]);
    }
}
/**
 * Asserts that a test document is detected as the expected type through three
 * differently constructed {@link TikaInputStream}s: one created from a byte
 * array, one from a temporary file and one from a plain {@link InputStream}.
 * The document's file name is set as the resource name in every case, and the
 * temporary file is deleted afterwards.
 *
 * @param fileName     name of the document under /test-documents/
 * @param expectedType expected media type
 * @throws Exception if the document cannot be read, the temporary file cannot be
 *                   written, or an assertion fails
 */
private void assertDetectionAllBackingTypes(String fileName, String expectedType)
        throws Exception {
    MediaType expected = MediaType.parse(expectedType);
    String resourcePath = "/test-documents/" + fileName;

    // Load file into byte array for testing. The shared getResourceAsStream helper is
    // used, as everywhere else in this class, instead of Class#getResourceAsStream
    // (which silently returns null for a missing resource).
    byte[] bytes;
    try (InputStream is = getResourceAsStream(resourcePath)) {
        assertNotNull(is, "Test resource not found: " + resourcePath);
        UnsynchronizedByteArrayOutputStream baos =
                UnsynchronizedByteArrayOutputStream.builder().get();
        IOUtils.copy(is, baos);
        bytes = baos.toByteArray();
    }

    // Test 1: ByteArrayBackedStrategy (TikaInputStream.get(byte[]))
    Metadata m1 = backingTypeMetadata(fileName);
    try (TikaInputStream tis = TikaInputStream.get(bytes, m1)) {
        assertBackingTypeDetection(expected, tis, m1, "ByteArrayBackedStrategy", fileName);
    }

    // Test 2: FileBackedStrategy (TikaInputStream.get(Path))
    Path tempFile = Files.createTempFile("tika-test-", "-" + fileName);
    try {
        Files.write(tempFile, bytes);
        Metadata m2 = backingTypeMetadata(fileName);
        try (TikaInputStream tis = TikaInputStream.get(tempFile, m2)) {
            assertBackingTypeDetection(expected, tis, m2, "FileBackedStrategy", fileName);
        }
    } finally {
        // clean up even when the assertion above fails
        Files.deleteIfExists(tempFile);
    }

    // Test 3: StreamBackedStrategy (TikaInputStream.get(InputStream))
    Metadata m3 = backingTypeMetadata(fileName);
    try (TikaInputStream tis = TikaInputStream.get(new ByteArrayInputStream(bytes), m3)) {
        assertBackingTypeDetection(expected, tis, m3, "StreamBackedStrategy", fileName);
    }
}

/**
 * Creates a fresh Metadata object carrying only the given resource name.
 */
private Metadata backingTypeMetadata(String fileName) {
    Metadata metadata = new Metadata();
    metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
    return metadata;
}

/**
 * Runs the detector on the stream and asserts the expected type, naming the
 * backing strategy and the file in the failure message.
 */
private void assertBackingTypeDetection(MediaType expected, TikaInputStream tis,
                                        Metadata metadata, String strategy,
                                        String fileName) throws Exception {
    MediaType detected = detector.detect(tis, metadata, new ParseContext());
    assertEquals(expected, detected, strategy + " detection failed for " + fileName);
}
/**
 * Reads the stream one byte at a time until {@code read()} returns a negative
 * value and reports how many bytes were read.
 *
 * @param is stream to drain; it is not closed here
 * @return number of bytes read
 * @throws IOException if reading fails
 */
private long countBytes(InputStream is) throws IOException {
    long total = 0;
    while (is.read() >= 0) {
        total++;
    }
    return total;
}
/**
 * Tests detection on truncated data obtained through
 * {@code DetectHelper.getStreamForDetectionOnly()}. This simulates a caller who
 * wants to detect a file type without reading the entire file (e.g. for large
 * files or streaming scenarios).
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testDetectionOnTruncatedData() throws Exception {
    final String ole2Container = "application/x-tika-msoffice";
    final String ooxmlContainer = "application/x-tika-ooxml";
    final String xlsx = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    final int oneMegabyte = 1024 * 1024;

    // OLE2 detection on truncated data (testWORD.doc)
    testTruncatedDetection("testWORD.doc", 1024, ole2Container, true);
    testTruncatedDetection("testWORD.doc", 4096, ole2Container, true);

    // OLE2 detection on truncated data (testEXCEL.xls)
    testTruncatedDetection("testEXCEL.xls", 1024, ole2Container, true);

    // OOXML/ZIP detection on truncated data (testEXCEL.xlsx)
    testTruncatedDetection("testEXCEL.xlsx", 300, ooxmlContainer, true);
    testTruncatedDetection("testEXCEL.xlsx", 1024, xlsx, true);

    // OOXML/ZIP detection on truncated data (testWORD.docx)
    testTruncatedDetection("testWORD.docx", 1024, ooxmlContainer, true);

    // With the full file - should NOT be marked as truncated
    testTruncatedDetection("testWORD.doc", oneMegabyte, "application/msword", false);
    testTruncatedDetection("testEXCEL.xlsx", oneMegabyte, xlsx, false);
}
/**
 * Opens a test document through {@code DetectHelper.getStreamForDetectionOnly}
 * with the given byte limit, then asserts both the detected type and the
 * truncation flag reported for the metadata.
 *
 * @param fileName        name of the document under /test-documents/
 * @param maxBytes        limit passed to {@code getStreamForDetectionOnly}
 * @param expectedType    expected media type
 * @param expectTruncated expected value of the truncation flag
 * @throws Exception if the document cannot be read or an assertion fails
 */
private void testTruncatedDetection(String fileName, int maxBytes,
                                    String expectedType, boolean expectTruncated)
        throws Exception {
    final String context = fileName + " with maxBytes=" + maxBytes;
    final Metadata metadata = new Metadata();
    try (InputStream raw = getResourceAsStream("/test-documents/" + fileName);
            TikaInputStream limited =
                    DetectHelper.getStreamForDetectionOnly(raw, maxBytes, metadata)) {
        MediaType detected = detector.detect(limited, metadata, new ParseContext());
        assertEquals(MediaType.parse(expectedType), detected,
                "Detection failed for " + context);

        assertEquals(expectTruncated,
                DetectHelper.isContentTruncatedForDetection(metadata),
                "Truncation flag mismatch for " + context);
    }
}
/**
 * Tests the truncation flag that detectors can consult to adjust their
 * behavior: it must be reported when the byte limit is smaller than the file
 * and must not be reported when the whole file fits within the limit.
 *
 * @throws Exception if a document cannot be read or an assertion fails
 */
@Test
public void testTruncationFlagInMetadata() throws Exception {
    // Truncated case: the file is larger than the 512 byte limit
    final Metadata truncatedMetadata = new Metadata();
    try (InputStream raw = getResourceAsStream("/test-documents/testWORD.doc");
            TikaInputStream tis =
                    DetectHelper.getStreamForDetectionOnly(raw, 512, truncatedMetadata)) {
        assertTrue(DetectHelper.isContentTruncatedForDetection(truncatedMetadata),
                "Expected truncation flag to be set for small buffer");
    }

    // Non-truncated case: testTXT.txt is small, so 100KB should be more than enough
    final Metadata fullMetadata = new Metadata();
    try (InputStream raw = getResourceAsStream("/test-documents/testTXT.txt");
            TikaInputStream tis =
                    DetectHelper.getStreamForDetectionOnly(raw, 100 * 1024, fullMetadata)) {
        // The flag should NOT be set since we read the whole file
        assertEquals(false, DetectHelper.isContentTruncatedForDetection(fullMetadata),
                "Expected truncation flag to NOT be set for full file");
    }
}
}