OOXMLParserTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.ooxml;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;

import java.text.DecimalFormatSymbols;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.poi.util.LocaleUtil;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.OfficeParserTest;

public class OOXMLParserTest extends MultiThreadedTikaTest {

    private static Locale USER_LOCALE = null;

    @BeforeAll
    public static void setUp() {
        USER_LOCALE = LocaleUtil.getUserLocale();
    }

    @AfterAll
    public static void tearDown() {
        LocaleUtil.setUserLocale(USER_LOCALE);
        Locale.setDefault(USER_LOCALE);
    }

    @BeforeEach
    public void beforeEach() {
        LocaleUtil.setUserLocale(Locale.US);
        Locale.setDefault(Locale.US);
    }

    @Test
    public void testExcel() throws Exception {
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);

        String content = getText("testEXCEL.xlsx", metadata, context);

        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));

        assertContains("Sample Excel Worksheet", content);
        assertContains("Numbers and their Squares", content);
        assertContains("9", content);
        assertNotContained("9.0", content);
        assertContains("196", content);
        assertNotContained("196.0", content);
        assertEquals("false", metadata.get(Office.PROTECTED_WORKSHEET));

    }

    @Test
    public void testExcelFormats() throws Exception {
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);

        String content = getText("testEXCEL-formats.xlsx", metadata, context);
        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                metadata.get(Metadata.CONTENT_TYPE));

        // Number #,##0.00
        assertContains("1,599.99", content);
        assertContains("-1,599.99", content);

        // Currency $#,##0.00;[Red]($#,##0.00)
        assertContains("$1,599.99", content);
        assertContains("$1,599.99)", content);

        // Scientific 0.00E+00
        // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08
        assertTrue(content.contains("1.98E08") || content.contains("1.98E+08"));
        assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08"));

        // Percentage
        assertContains("2.50%", content);
        // Excel rounds up to 3%, but that requires Java 1.6 or later
        if (System.getProperty("java.version").startsWith("1.5")) {
            assertContains("2%", content);
        } else {
            assertContains("3%", content);
        }

        // Time Format: h:mm
        assertContains("6:15", content);
        assertContains("18:15", content);

        // Date Format: d-mmm-yy
        assertContains("17-May-07", content);

        // Currency $#,##0.00;[Red]($#,##0.00)
        assertContains("$1,599.99", content);
        assertContains("($1,599.99)", content);

        // Fraction (2.5): # ?/?
        assertContains("2 1/2", content);

        // Below assertions represent outstanding formatting issues to be addressed
        // they are included to allow the issues to be progressed with the Apache POI
        // team - See TIKA-103.

        /*************************************************************************
         // Date Format: m/d/yy
         assertContains("03/10/2009", content);

         // Date/Time Format
         assertContains("19/01/2008 04:35", content);

         // Custom Number (0 "dollars and" .00 "cents")
         assertContains("19 dollars and .99 cents", content);

         // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
         assertContains("At 4:20 AM on Thursday May 17, 2007", content);
         **************************************************************************/

    }

    @Test
    @Disabled("OOXML-Strict not currently supported by POI, see #57699")
    public void testExcelStrict() throws Exception {
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);

        String content = getText("testEXCEL.strict.xlsx", metadata, context);

        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Sample Spreadsheet", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Nick Burch", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Spreadsheet for testing", metadata.get(TikaCoreProperties.DESCRIPTION));
        assertContains("Test spreadsheet", content);
        assertContains("This one is red", content);
        assertContains("cb=10", content);
        assertNotContained("10.0", content);
        assertContains("cb=sum", content);
        assertNotContained("13.0", content);
        assertEquals("false", metadata.get(Office.PROTECTED_WORKSHEET));

    }

    /**
     * Documents with some sheets are protected, but not all.
     * See TIKA-364.
     */
    @Test
    public void testProtectedExcelSheets() throws Exception {

        Metadata metadata = getXML("protectedSheets.xlsx").metadata;

        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                metadata.get(Metadata.CONTENT_TYPE));

        assertEquals("true", metadata.get(Office.PROTECTED_WORKSHEET));

    }

    /**
     * An excel document which is password protected.
     * See TIKA-437.
     */
    @Test
    public void testProtectedExcelFile() throws Exception {
        XMLResult xmlResult = getXML("protectedFile.xlsx");

        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                xmlResult.metadata.get(Metadata.CONTENT_TYPE));

        assertEquals("true", xmlResult.metadata.get(Office.PROTECTED_WORKSHEET));

        assertContains("Office", xmlResult.xml);
    }

    /**
     * Ensures that custom OOXML properties are extracted
     */
    @Test
    public void testExcelCustomProperties() throws Exception {
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);
        getXML("testEXCEL_custom_props.xlsx", metadata, context);

        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                metadata.get(Metadata.CONTENT_TYPE));
        assertEquals(null, metadata.get(TikaCoreProperties.CREATOR));
        assertEquals(null, metadata.get(TikaCoreProperties.MODIFIER));
        assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
        assertEquals("2011-08-22T14:24:38Z", metadata.get(TikaCoreProperties.MODIFIED));
        assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
        assertEquals("true", metadata.get("custom:myCustomBoolean"));
        assertEquals("3", metadata.get("custom:myCustomNumber"));
        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
        assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
        assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
    }

    //TIKA-1100:
    @Test
    public void testExcelTextBox() throws Exception {
        XMLResult r = getXML("testEXCEL_textbox.xlsx");
        assertContains("some autoshape", r.xml);
    }

    //TIKA-2346
    @Test
    public void testTurningOffTextBoxExtractionExcel() throws Exception {

        ParseContext pc = new ParseContext();
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setIncludeShapeBasedContent(false);
        pc.set(OfficeParserConfig.class, officeParserConfig);
        String xml = getXML("testEXCEL_textbox.xlsx", pc).xml;
        assertNotContained("autoshape", xml);
    }

    @Test
    public void testXLSXThumbnail() throws Exception {
        String xml = getXML("testXLSX_Thumbnail.xlsx").xml;
        int a = xml.indexOf("This file contains an embedded thumbnail by default");
        int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.wmf\" />");

        assertTrue(a != -1);
        assertTrue(b != -1);
        assertTrue(a < b);
    }

    @Test
    public void testEncrypted() throws Exception {
        Map<String, String> tests = new HashMap<>();
        //the first three contain javax.crypto.CipherInputStream
        tests.put("testWORD_protected_passtika.docx", "This is an encrypted Word 2007 File");
        tests.put("testPPT_protected_passtika.pptx", "This is an encrypted PowerPoint 2007 slide.");
        tests.put("testEXCEL_protected_passtika.xlsx", "This is an Encrypted Excel spreadsheet.");
        //TIKA-2873 this one contains a ChunkedCipherInputStream
        //that is buggy at the POI level...can unwrap TikaInputStream in OfficeParser
        //once https://bz.apache.org/bugzilla/show_bug.cgi?id=63431 is fixed.
        tests.put("testEXCEL_protected_passtika_2.xlsx",
                "This is an Encrypted Excel spreadsheet with a ChunkedCipherInputStream.");

        PasswordProvider passwordProvider = new PasswordProvider() {
            @Override
            public String getPassword(Metadata metadata) {
                return "tika";
            }
        };
        ParseContext passwordContext = new ParseContext();
        passwordContext.set(org.apache.tika.parser.PasswordProvider.class, passwordProvider);

        for (Map.Entry<String, String> e : tests.entrySet()) {
            XMLResult xmlResult = getXML(e.getKey(), passwordContext);
            assertContains(e.getValue(), xmlResult.xml);
        }

        ParseContext context = new ParseContext();
        //now try with no password
        for (Map.Entry<String, String> e : tests.entrySet()) {
            boolean exc = false;
            try {
                getXML(e.getKey());
            } catch (EncryptedDocumentException ex) {
                exc = true;
            }
            assertTrue(exc);
        }
    }

    @Test
    public void testExcelHeaderAndFooterExtraction() throws Exception {
        XMLResult xml = getXML("testEXCEL_headers_footers.xlsx");

        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                xml.metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Internal spreadsheet", xml.metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Aeham Abushwashi", xml.metadata.get(TikaCoreProperties.CREATOR));

        String content = xml.xml;
        assertContains("John Smith1", content);
        assertContains("John Smith50", content);
        assertContains("1 Corporate HQ", content);
        assertContains("Header - Corporate Spreadsheet", content);
        assertContains("Header - For Internal Use Only", content);
        assertContains("Header - Author: John Smith", content);
        assertContains("Footer - Corporate Spreadsheet", content);
        assertContains("Footer - For Internal Use Only", content);
        assertContains("Footer - Author: John Smith", content);
    }

    @Test
    public void testExcelHeaderAndFooterNotExtraction() throws Exception {
        ParseContext parseContext = new ParseContext();
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setIncludeHeadersAndFooters(false);
        parseContext.set(OfficeParserConfig.class, officeParserConfig);

        String content = getXML("testEXCEL_headers_footers.xlsx", parseContext).xml;
        assertNotContained("Header - Corporate Spreadsheet", content);
        assertNotContained("Header - For Internal Use Only", content);
        assertNotContained("Header - Author: John Smith", content);
        assertNotContained("Footer - Corporate Spreadsheet", content);
        assertNotContained("Footer - For Internal Use Only", content);
        assertNotContained("Footer - Author: John Smith", content);

        //now test configuration via tika-config
        Parser configuredParser = TikaLoader.load(
                getConfigPath(OfficeParserTest.class, "tika-config-headers-footers.json"))
                .loadAutoDetectParser();
        content = getXML("testEXCEL_headers_footers.xlsx", configuredParser).xml;
        assertContains("John Smith1", content);
        assertContains("John Smith50", content);
        assertContains("1 Corporate HQ", content);
        assertNotContained("Header - Corporate Spreadsheet", content);
        assertNotContained("Header - For Internal Use Only", content);
        assertNotContained("Header - Author: John Smith", content);
        assertNotContained("Footer - Corporate Spreadsheet", content);
        assertNotContained("Footer - For Internal Use Only", content);
        assertNotContained("Footer - Author: John Smith", content);
    }


    @Test
    public void testHyperlinksInXLSX() throws Exception {
        String xml = getXML("testEXCEL_hyperlinks.xlsx").xml;
        //external url
        assertContains("<a href=\"http://tika.apache.org/\">", xml);
        //mail url
        assertContains("<a href=\"mailto:user@tika.apache.org?subject=help\">", xml);
        //external linked file
        assertContains("<a href=\"linked_file.txt.htm\">", xml);
        //link on textbox
        assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
    }


    @Test
    public void testBigIntegersWGeneralFormat() throws Exception {
        //TIKA-2025
        String xml = getXML("testEXCEL_big_numbers.xlsx").xml;
        assertContains("123456789012345", xml);//15 digit number
        assertContains("123456789012346", xml);//15 digit formula
        Locale locale = LocaleUtil.getUserLocale();

        DecimalFormatSymbols symbols = new DecimalFormatSymbols(locale);
        //16 digit number is treated as scientific notation as is the 16 digit formula
        assertContains("1" + symbols.getDecimalSeparator() + "23456789012345E+15</td>\t" + "<td>1" +
                symbols.getDecimalSeparator() + "23456789012345E+15", xml);
    }

    @Test
    public void testBigIntegersWGeneralFormatWLocaleIT() throws Exception {
        LocaleUtil.setUserLocale(Locale.ITALIAN);
        //TIKA-2438
        try {
            String xml = getXML("testEXCEL_big_numbers.xlsx").xml;
            assertContains("123456789012345", xml);//15 digit number
            assertContains("123456789012346", xml);//15 digit formula
            Locale locale = LocaleUtil.getUserLocale();

            DecimalFormatSymbols symbols = new DecimalFormatSymbols(locale);
            //16 digit number is treated as scientific notation as is the 16 digit formula
            assertContains(
                    "1" + symbols.getDecimalSeparator() + "23456789012345E+15</td>\t" + "<td>1" +
                            symbols.getDecimalSeparator() + "23456789012345E+15", xml);
        } finally {
            LocaleUtil.setUserLocale(USER_LOCALE);
        }
    }


    @Test
    public void testMacroinXlsm() throws Exception {

        //test default is "don't extract macros"
        List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_macro.xlsm");
        for (Metadata metadata : metadataList) {
            if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
                fail("Shouldn't have extracted macros as default");
            }
        }
        assertEquals("ThisWorkbook", metadataList.get(0).get(Office.WORKBOOK_CODENAME));

        //now test that they were extracted
        ParseContext context = new ParseContext();
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setExtractMacros(true);
        context.set(OfficeParserConfig.class, officeParserConfig);

        Metadata minExpected = new Metadata();
        minExpected.add(TikaCoreProperties.TIKA_CONTENT.getName(), "Sub Dirty()");
        minExpected.add(TikaCoreProperties.TIKA_CONTENT.getName(), "dirty dirt dirt");
        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());

        assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xlsm", context));

        //test configuring via config file
        Parser parser = TikaLoader.load(
                getConfigPath(OOXMLParserTest.class, "tika-config-dom-macros.json"))
                .loadAutoDetectParser();
        assertContainsAtLeast(minExpected,
                getRecursiveMetadata("testEXCEL_macro.xlsm", parser));
    }

    @Test
    public void testExcelXLSB() throws Exception {
        Detector detector = new DefaultDetector();

        Metadata m = new Metadata();
        m.add(TikaCoreProperties.RESOURCE_NAME_KEY, "excel.xlsb");

        // Should be detected correctly
        MediaType type;
        try (TikaInputStream tis = getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
            type = detector.detect(tis, m, new ParseContext());
            assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
        }

        // OfficeParser won't handle it
        assertEquals(false,
                (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));

        // OOXMLParser will (soon) handle it
        assertTrue((new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));

        // AutoDetectParser doesn't break on it
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);
        String content = getText("testEXCEL.xlsb", new Metadata(), context);
        assertContains("This is an example spreadsheet", content);
    }

    @Test
    public void testXLSBVarious() throws Exception {
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setExtractMacros(true);
        ParseContext parseContext = new ParseContext();
        parseContext.set(OfficeParserConfig.class, officeParserConfig);
        List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_various.xlsb", parseContext);
        assertEquals(4, metadataList.size());

        String xml = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
        assertContains("<td>13</td>", xml);
        assertContains("<td>13.1211231321</td>", xml);
        assertContains("<td>$   3.03</td>", xml);
        assertContains("<td>20%</td>", xml);
        assertContains("<td>13.12</td>", xml);
        assertContains("<td>123456789012345</td>", xml);
        assertContains("<td>1.23456789012345E+15</td>", xml);
        assertContains("test comment2", xml);

        assertContains("comment4 (end of row)", xml);


        assertContains("<td>1/4</td>", xml);
        assertContains("<td>3/9/17</td>", xml);
        assertContains("<td>4</td>", xml);
        assertContains("<td>2</td>", xml);

        assertContains("<td>   46/1963</td>", xml);
        assertContains("<td>  3/128</td>", xml);
        assertContains("test textbox", xml);

        assertContains("test WordArt", xml);

        assertContains("<a href=\"http://lucene.apache.org/\">http://lucene.apache.org/</a>", xml);
        assertContains("<a href=\"http://tika.apache.org/\">http://tika.apache.org/</a>", xml);

        assertContains("OddLeftHeader OddCenterHeader OddRightHeader", xml);
        assertContains("EvenLeftHeader EvenCenterHeader EvenRightHeader", xml);

        assertContains("FirstPageLeftHeader FirstPageCenterHeader FirstPageRightHeader", xml);
        assertContains("OddLeftFooter OddCenterFooter OddRightFooter", xml);
        assertContains("EvenLeftFooter EvenCenterFooter EvenRightFooter", xml);
        assertContains("FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter", xml);

    }

    @Test
    public void testXLSBNoHeaderFooters() throws Exception {
        ParseContext parseContext = new ParseContext();
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setIncludeHeadersAndFooters(false);
        parseContext.set(OfficeParserConfig.class, officeParserConfig);
        String xml = getXML("testEXCEL_various.xlsb", parseContext).xml;
        assertNotContained("OddLeftHeader OddCenterHeader OddRightHeader", xml);
        assertNotContained("EvenLeftHeader EvenCenterHeader EvenRightHeader", xml);

        assertNotContained("FirstPageLeftHeader FirstPageCenterHeader FirstPageRightHeader", xml);
        assertNotContained("OddLeftFooter OddCenterFooter OddRightFooter", xml);
        assertNotContained("EvenLeftFooter EvenCenterFooter EvenRightFooter", xml);
        assertNotContained("FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter", xml);

    }

    @Test
    public void testPOI61034() throws Exception {
        //tests temporary work around until POI 3.17-beta1 is released
        XMLResult r = getXML("testEXCEL_poi-61034.xlsx");
        Matcher m = Pattern.compile("<h1>(Sheet\\d+)</h1>").matcher(r.xml);
        Set<String> seen = new HashSet<>();
        while (m.find()) {
            String sheetName = m.group(1);
            if (seen.contains(sheetName)) {
                fail("Should only see each sheet once: " + sheetName);
            }
            seen.add(sheetName);
        }

    }

    @Test
    public void testXLSBOriginalPath() throws Exception {
        assertEquals("C:\\Users\\tallison\\Desktop\\working\\TIKA-1945\\",
                getXML("testEXCEL_diagramData.xlsb").metadata
                        .get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
    }

    @Test
    public void testXLSXOriginalPath() throws Exception {
        assertEquals("C:\\Users\\tallison\\Desktop\\working\\TIKA-1945\\",
                getXML("testEXCEL_diagramData.xlsx").metadata
                        .get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
    }

    @Test
    public void testXLSBDiagramData() throws Exception {
        assertContains("SmartArt", getXML("testEXCEL_diagramData.xlsb").xml);
    }

    @Test
    public void testXLSXDiagramData() throws Exception {
        assertContains("SmartArt", getXML("testEXCEL_diagramData.xlsx").xml);
    }

    @Test
    public void testXLSXChartData() throws Exception {
        String xml = getXML("testEXCEL_charts.xlsx").xml;
        assertContains("peach", xml);
        assertContains("March\tApril", xml);
        assertNotContained("chartSpace", xml);
    }

    @Test
    public void testXLSBChartData() throws Exception {
        String xml = getXML("testEXCEL_charts.xlsb").xml;
        assertContains("peach", xml);
        assertContains("March\tApril", xml);
        assertNotContained("chartSpace", xml);
    }

    @Test
    public void testXLSXPhoneticStrings() throws Exception {
        //This unit test and test file come from Apache POI 51519.xlsx

        //test default concatenates = true
        assertContains("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3",
                getXML("testEXCEL_phonetic.xlsx").xml);

        //test turning it off
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setConcatenatePhoneticRuns(false);
        ParseContext pc = new ParseContext();
        pc.set(OfficeParserConfig.class, officeParserConfig);
        assertNotContained("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3",
                getXML("testEXCEL_phonetic.xlsx", pc).xml);


        //test configuring via config file
        Parser parser = TikaLoader.load(
                getConfigPath(OfficeParserTest.class, "tika-config-exclude-phonetic.json"))
                .loadAutoDetectParser();
        assertNotContained("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3",
                getXML("testEXCEL_phonetic.xlsx", parser).xml);

    }

    @Test
    public void testEmbeddedXLSInOLEObject() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testPPT_oleWorkbook.pptx");
        assertEquals(4, metadataList.size());
        Metadata xlsx = metadataList.get(2);
        assertContains("<h1>Sheet1</h1>", xlsx.get(TikaCoreProperties.TIKA_CONTENT));
        assertContains("<td>1</td>", xlsx.get(TikaCoreProperties.TIKA_CONTENT));
        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                xlsx.get(Metadata.CONTENT_TYPE));
    }


    @Test
    public void testSigned() throws Exception {
        Metadata m = getXML("testWORD_signed.docx").metadata;
        assertEquals("true", m.get(TikaCoreProperties.HAS_SIGNATURE));

        m = getXML("testEXCEL_signed.xlsx").metadata;
        assertEquals("true", m.get(TikaCoreProperties.HAS_SIGNATURE));

        m = getXML("testPPT_signed.pptx").metadata;
        assertEquals("true", m.get(TikaCoreProperties.HAS_SIGNATURE));

    }

    @Test
    public void testDateFormat() throws Exception {
        Parser p = TikaLoader.load(
                getConfigPath(OOXMLParserTest.class, "tika-config-custom-date-override.json"))
                .loadAutoDetectParser();
        String xml = getXML("testEXCEL_dateFormats.xlsx", p).xml;
        assertContains("2018-09-20", xml);
        assertContains("1996-08-10", xml);
    }

    @Test
    public void testDocSecurity() throws Exception {
        assertEquals(OfficeOpenXMLExtended.SECURITY_PASSWORD_PROTECTED,
                getRecursiveMetadata("protectedFile.xlsx").get(0)
                        .get(OfficeOpenXMLExtended.DOC_SECURITY_STRING));
        assertEquals(OfficeOpenXMLExtended.SECURITY_READ_ONLY_ENFORCED,
                getRecursiveMetadata("testWORD_docSecurity.docx").get(0)
                        .get(OfficeOpenXMLExtended.DOC_SECURITY_STRING));
    }

    @Test
    public void testMultiThreaded() throws Exception {
        //TIKA-3627
        int numThreads = 5;
        int numIterations = 5;
        ParseContext[] parseContexts = new ParseContext[numThreads];
        for (int i = 0; i < parseContexts.length; i++) {
            parseContexts[i] = new ParseContext();
        }
        Set<String> extensions = new HashSet<>();
        extensions.add(".pptx");
        extensions.add(".docx");
        extensions.add(".xlsx");
        extensions.add(".ppt");
        extensions.add(".doc");
        extensions.add(".xls");
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
        testMultiThreaded(wrapper, parseContexts, numThreads, numIterations, path -> {
            String pathName = path.getName().toLowerCase(Locale.ENGLISH);
            if (pathName.equalsIgnoreCase("testRecordSizeExceeded.xlsx")) {
                return false;
            }
            int i = pathName.lastIndexOf(".");
            String ext = "";
            if (i > -1) {
                ext = pathName.substring(i);
            }
            return extensions.contains(ext);
        });

    }

    @Test
    public void testNoRecordSizeOverflow() throws Exception {
        //TIKA-4474 -- test: files (passed as stream) no longer have limit on record size as they are spooled
        String content = getText("testRecordSizeExceeded.xlsx");
        assertContains("Repetitive content pattern 3 for compression test row 1", content);
    }

}