// TestOldExcelExtractor.java

/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */

package org.apache.poi.hssf.extractor;

import static org.apache.poi.POITestCase.assertContains;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.security.Permission;

import org.apache.commons.io.output.NullPrintStream;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.poi.EmptyFileException;
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.POIDataSamples;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LocaleUtil;
import org.apache.poi.util.RecordFormatException;
import org.apache.poi.util.SuppressForbidden;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

/**
 * Unit tests for the Excel 5/95 and Excel 4 (and older) text
 *  extractor
 */
@SuppressForbidden("tests java.security features deprecated in java 17 - no other option though")
final class TestOldExcelExtractor {
    /**
     * Opens the named HSSF sample file with an {@link OldExcelExtractor}.
     *
     * @param sampleFileName name of the sample file, resolved through {@link HSSFTestDataSamples}
     * @return a new extractor over that file; it is returned open, so the caller has to close it
     * @throws IOException if the extractor cannot be created
     */
    private static OldExcelExtractor createExtractor(String sampleFileName) throws IOException {
        return new OldExcelExtractor(HSSFTestDataSamples.getSampleFile(sampleFileName));
    }

    /**
     * Extracts the text of the Excel 3 sample and looks for known words, numbers
     * and a date-like literal, then checks the reported BIFF version and file type.
     */
    @Test
    void testSimpleExcel3() throws IOException {
        try (OldExcelExtractor excel3 = createExtractor("testEXCEL_3.xls")) {
            // getText() has to succeed before any content can be inspected
            final String extracted = excel3.getText();

            // words expected in the sheet
            assertContains(extracted, "Season beginning August");
            assertContains(extracted, "USDA");

            // numbers expected in the sheet
            assertContains(extracted, "347");
            assertContains(extracted, "228");

            // a date that is stored as a plain string literal
            assertContains(extracted, "1981/82");

            // format details reported by the extractor
            assertEquals(3, excel3.getBiffVersion());
            assertEquals(0x10, excel3.getFileType());
        }
    }


    /**
     * Opens and closes the Excel 3 sample without ever requesting its text.
     */
    @Test
    void testSimpleExcel3NoReading() throws IOException {
        try (OldExcelExtractor unread = createExtractor("testEXCEL_3.xls")) {
            // deliberately no getText() call: only creation and close are exercised
            assertNotNull(unread);
        }
    }

    /**
     * Extracts the text of the Excel 4 sample and looks for known words and
     * numbers, then checks the reported BIFF version and file type.
     */
    @Test
    void testSimpleExcel4() throws IOException {
        try (OldExcelExtractor excel4 = createExtractor("testEXCEL_4.xls")) {
            // getText() has to succeed before any content can be inspected
            final String extracted = excel4.getText();

            // words expected in the sheet
            assertContains(extracted, "Size");
            assertContains(extracted, "Returns");

            // numbers expected in the sheet
            assertContains(extracted, "11");
            assertContains(extracted, "784");

            // format details reported by the extractor
            assertEquals(4, excel4.getBiffVersion());
            assertEquals(0x10, excel4.getFileType());
        }
    }

    /**
     * Runs the same content checks against both the Excel 5 and the Excel 95
     * sample; both are expected to report BIFF version 5 and file type 0x05.
     */
    @Test
    void testSimpleExcel5() throws IOException {
        final String[] versions = {"5", "95"};
        for (String version : versions) {
            final String sampleName = "testEXCEL_" + version + ".xls";
            try (OldExcelExtractor excel5 = createExtractor(sampleName)) {
                // getText() has to succeed before any content can be inspected
                final String extracted = excel5.getText();

                // words expected in the workbook
                assertContains(extracted, "Sample Excel");
                assertContains(extracted, "Written and saved");

                // numbers expected in the workbook
                assertContains(extracted, "15");
                assertContains(extracted, "169");

                // sheet names are emitted too (new formats only)
                assertContains(extracted, "Sheet: Feuil3");

                // format details reported by the extractor
                assertEquals(5, excel5.getBiffVersion());
                assertEquals(0x05, excel5.getFileType());
            }
        }
    }

    /**
     * Checks that plain and punctuation-heavy string cells of the Excel 4 sample
     * show up in the extracted text.
     */
    @Test
    void testStrings() throws IOException {
        try (OldExcelExtractor excel4 = createExtractor("testEXCEL_4.xls")) {
            final String extracted = excel4.getText();

            // plain strings
            assertContains(extracted, "Table 10 -- Examination Coverage:");
            assertContains(extracted, "Recommended and Average Recommended Additional Tax After");
            assertContains(extracted, "Individual income tax returns, total");

            // strings with currency signs, brackets and typographic quotes
            assertContains(extracted, "$100,000 or more");
            assertContains(extracted, "S corporation returns, Form 1120S [10,15]");
            assertContains(extracted, "individual income tax return \u201Cshort forms.\u201D");

            // TODO Find some formula based strings, then test them here
        }
    }

    /**
     * Checks that literal and formula-derived numbers of the Excel 4 sample show
     * up in the extracted text. Formatted output is not asserted yet (see TODO).
     */
    @Test
    void testFormattedNumbersExcel4() throws IOException {
        try (OldExcelExtractor excel4 = createExtractor("testEXCEL_4.xls")) {
            final String extracted = excel4.getText();

            // numbers stored as literals
            assertContains(extracted, "151");
            assertContains(extracted, "784");

            // numbers produced by formulas
            assertContains(extracted, "0.398"); // TODO Rounding
            assertContains(extracted, "624");

            // TODO formatted numbers, once supported:
            // assertContains(text, "55,624");
            // assertContains(text, "11,743,477");
        }
    }

    /**
     * Checks that literal and formula-derived numbers of the Excel 5 and Excel 95
     * samples show up in the extracted text. Formatted output is not asserted yet
     * (see TODO).
     */
    @Test
    void testFormattedNumbersExcel5() throws IOException {
        final String[] versions = {"5", "95"};
        for (String version : versions) {
            final String sampleName = "testEXCEL_" + version + ".xls";
            try (OldExcelExtractor excel5 = createExtractor(sampleName)) {
                final String extracted = excel5.getText();

                // numbers stored as literals
                assertContains(extracted, "1");

                // numbers produced by formulas
                assertContains(extracted, "13");
                assertContains(extracted, "169");

                // TODO formatted numbers, once supported:
                // assertContains(text, "100.00%");
                // assertContains(text, "155.00%");
                // assertContains(text, "1,125");
                // assertContains(text, "189,945");
                // assertContains(text, "1,234,500");
                // assertContains(text, "$169.00");
                // assertContains(text, "$1,253.82");
            }
        }
    }

    /**
     * Builds the extractor straight from a {@link File} for the Excel 4, 5 and 95
     * samples and expects more than 100 characters of text from each.
     */
    @Test
    void testFromFile() throws IOException {
        final String[] versions = {"4", "5", "95"};
        for (String version : versions) {
            final File sample = HSSFTestDataSamples.getSampleFile("testEXCEL_" + version + ".xls");

            try (OldExcelExtractor fromFile = new OldExcelExtractor(sample)) {
                final String extracted = fromFile.getText();
                assertNotNull(extracted);
                assertTrue(extracted.length() > 100);
            }
        }
    }

    /**
     * Builds the extractor from an {@link InputStream} for the Excel 4, 5 and 95
     * samples and expects more than 100 characters of text from each.
     */
    @Test
    void testFromInputStream() throws IOException {
        final String[] versions = {"4", "5", "95"};
        for (String version : versions) {
            final File sample = HSSFTestDataSamples.getSampleFile("testEXCEL_" + version + ".xls");

            try (InputStream rawBytes = new FileInputStream(sample);
                 OldExcelExtractor fromStream = new OldExcelExtractor(rawBytes)) {
                final String extracted = fromStream.getText();
                assertNotNull(extracted);
                assertTrue(extracted.length() > 100);
            }
        }
    }

    /**
     * Feeds three kinds of unsuitable input to the extractor and checks the
     * exception type raised for each of them.
     */
    @Test
    void testOpenInvalidFile1() throws IOException {
        // an existing file, but in a different format (.xlsx)
        final String ooxmlSample = "WithVariousData.xlsx";
        assertThrows(OfficeXmlFileException.class, () -> createExtractor(ooxmlSample).close());

        // a completely different type of file (plain text)
        final String textSample = "48936-strings.txt";
        assertThrows(RecordFormatException.class, () -> createExtractor(textSample).close());

        // a POIFS file which is not a Workbook
        try (InputStream wordDoc = POIDataSamples.getDocumentInstance().openResourceAsStream("47304.doc")) {
            assertThrows(FileNotFoundException.class, () -> new OldExcelExtractor(wordDoc).close());
        }
    }

    /**
     * Points the extractor at a path that is not expected to exist and checks
     * that an {@link EmptyFileException} is raised.
     */
    @Test
    void testOpenNonExistingFile() {
        // relative path for which no file is expected to be present
        final File missing = new File("notexistingfile.xls");
        assertThrows(EmptyFileException.class, () -> new OldExcelExtractor(missing).close());
    }

    /**
     * Reads the Excel 3 sample through an {@link InputStream} and checks that
     * text extraction yields a non-null result.
     */
    @Test
    void testInputStream() throws IOException {
        final File sample = HSSFTestDataSamples.getSampleFile("testEXCEL_3.xls");
        try (InputStream rawBytes = new FileInputStream(sample);
             OldExcelExtractor fromStream = new OldExcelExtractor(rawBytes)) {
            final String extracted = fromStream.getText();
            assertNotNull(extracted);
        }
    }

    /**
     * Reads {@code FormulaRefs.xls} through an {@link InputStream} and checks
     * that text extraction yields a non-null result.
     */
    @Test
    void testInputStreamNPOIHeader() throws IOException {
        // TODO: the worksheet names currently come out mangled. They are treated
        // as if they were UTF-16, but they are plain ascii. This needs fixing.
        // Could the leading 0 byte in a worksheet name be a signal that the name
        // should be interpreted as ascii/1252?
        final File sample = HSSFTestDataSamples.getSampleFile("FormulaRefs.xls");
        try (InputStream rawBytes = new FileInputStream(sample);
             OldExcelExtractor fromStream = new OldExcelExtractor(rawBytes)) {
            final String extracted = fromStream.getText();
            assertNotNull(extracted);
        }
    }

    /**
     * Builds the extractor from a {@link POIFSFileSystem} opened on
     * {@code FormulaRefs.xls} and checks that text extraction yields a non-null result.
     */
    @Test
    void testPOIFSFileSystem() throws IOException {
        final File sample = HSSFTestDataSamples.getSampleFile("FormulaRefs.xls");
        try (POIFSFileSystem poifs = new POIFSFileSystem(sample);
             OldExcelExtractor fromPoifs = new OldExcelExtractor(poifs)) {
            final String extracted = fromPoifs.getText();
            assertNotNull(extracted);
        }
    }

    /**
     * Builds the extractor from the root directory node of a
     * {@link POIFSFileSystem} opened on {@code FormulaRefs.xls} and checks that
     * text extraction yields a non-null result.
     */
    @Test
    void testDirectoryNode() throws IOException {
        final File sample = HSSFTestDataSamples.getSampleFile("FormulaRefs.xls");
        try (POIFSFileSystem poifs = new POIFSFileSystem(sample);
             OldExcelExtractor fromRoot = new OldExcelExtractor(poifs.getRoot())) {
            final String extracted = fromRoot.getText();
            assertNotNull(extracted);
        }
    }

    /**
     * Hands the root directory node of a Word document ({@code test.doc}) to the
     * extractor and expects construction to fail with a {@link FileNotFoundException}.
     */
    @Test
    void testDirectoryNodeInvalidFile() throws IOException {
        File file = POIDataSamples.getDocumentInstance().getFile("test.doc");
        try (POIFSFileSystem fs = new POIFSFileSystem(file)) {
            // Chain close() like the other negative tests in this class: if the constructor
            // ever stops throwing, the extractor is still released before assertThrows fails.
            assertThrows(FileNotFoundException.class, () -> new OldExcelExtractor(fs.getRoot()).close());
        }
    }

    /**
     * Calls {@link OldExcelExtractor#main(String[])} without arguments while a
     * {@link NoExitSecurityManager} is installed and {@code System.err} is
     * silenced, and expects the exit attempt to surface as an {@link ExitException}.
     */
    @Test
    @Disabled("Redirecting System.out/err is bad in concurrent test environment")
    void testMainUsage() {
        final SecurityManager previousManager = System.getSecurityManager();
        final PrintStream previousErr = System.err;
        System.setSecurityManager(new NoExitSecurityManager());
        try {
            System.setErr(NullPrintStream.INSTANCE);
            final String[] noArgs = {};
            // main() calls System.exit() here
            assertThrows(ExitException.class, () -> OldExcelExtractor.main(noArgs));
        } finally {
            // put the JVM-wide state back regardless of the outcome
            System.setSecurityManager(previousManager);
            System.setErr(previousErr);
        }
    }

    /**
     * Runs {@link OldExcelExtractor#main(String[])} on the Excel 3 sample with
     * {@code System.out} redirected into an in-memory buffer and checks that the
     * captured output contains a known heading. {@code System.out} is restored
     * afterwards in all cases.
     */
    @Test
    @Disabled("Redirecting System.out/err is bad in concurrent test environment")
    void testMain() throws IOException {
        File file = HSSFTestDataSamples.getSampleFile("testEXCEL_3.xls");
        PrintStream save = System.out;
        // The PrintStream constructor expects a charset *name*. Charset.name() is the canonical
        // identifier, whereas displayName(Locale) is a human-readable label that a charset
        // implementation is allowed to localize.
        try (UnsynchronizedByteArrayOutputStream out = UnsynchronizedByteArrayOutputStream.builder().get();
             PrintStream str = new PrintStream(out, false, StandardCharsets.UTF_8.name())) {
            System.setOut(str);
            OldExcelExtractor.main(new String[] {file.getAbsolutePath()});
            // auto-flush is off, so flush explicitly before reading the buffer back
            str.flush();
            String string = out.toString(StandardCharsets.UTF_8);
            assertTrue(string.contains("Table C-13--Lemons"), "Had: " + string);
        } finally {
            System.setOut(save);
        }
    }

    /**
     * Opens sample {@code 60284.xls}: version and file type can be read, but
     * requesting the text is expected to raise an {@link EncryptedDocumentException}.
     */
    @Test
    void testEncryptionException() throws IOException {
        // test file derives from Common Crawl
        final File sample = HSSFTestDataSamples.getSampleFile("60284.xls");

        try (OldExcelExtractor encrypted = new OldExcelExtractor(sample)) {
            assertEquals(5, encrypted.getBiffVersion());
            assertEquals(5, encrypted.getFileType());
            assertThrows(EncryptedDocumentException.class, encrypted::getText);
        }
    }

    /**
     * Opens sample {@code 64130.xls} and checks the reported version, the file
     * type and one known word in the extracted text.
     */
    @Test
    void testSheetWithNoName() throws IOException {
        final File sample = HSSFTestDataSamples.getSampleFile("64130.xls");

        try (OldExcelExtractor unnamedSheet = new OldExcelExtractor(sample)) {
            assertEquals(5, unnamedSheet.getBiffVersion());
            assertEquals(5, unnamedSheet.getFileType());

            final String extracted = unnamedSheet.getText();
            assertContains(extracted, "Dawn");
        }
    }

    /**
     * Security manager that permits every checked operation but turns each exit
     * check into an {@link ExitException} carrying the requested status.
     */
    @SuppressForbidden("tests java.security features deprecated in java 17 - no other option though")
    private static class NoExitSecurityManager extends SecurityManager {
        /** Runs the inherited exit check, then always vetoes the exit by throwing. */
        @Override
        public void checkExit(int status) {
            super.checkExit(status);
            final ExitException veto = new ExitException(status);
            throw veto;
        }

        /** Grants every permission: intentionally empty. */
        @Override
        public void checkPermission(Permission perm) {
            // nothing is denied
        }

        /** Grants every permission for any context: intentionally empty. */
        @Override
        public void checkPermission(Permission perm, Object context) {
            // nothing is denied
        }
    }

    /**
     * Signals an exit attempt; the requested exit code is kept in {@link #status}.
     */
    private static class ExitException extends SecurityException {
        /** Fixed detail message used for every instance. */
        private static final String MESSAGE = "There is no escape!";

        /** Exit code passed to the constructor. */
        public final int status;

        /**
         * @param status the exit code to record
         */
        public ExitException(int status) {
            super(MESSAGE);
            this.status = status;
        }
    }

    /**
     * Exercises the metadata extractor handed out for the Excel 3 sample: it has
     * no nested metadata extractor, yields empty text, exposes a document and a
     * filesystem, and ignores attempts to switch off closing the filesystem.
     */
    @Test
    void testMetaData() throws IOException {
        try (OldExcelExtractor excel3 = createExtractor("testEXCEL_3.xls")) {
            final POITextExtractor meta = excel3.getMetadataTextExtractor();
            assertNotNull(meta);

            // asking the metadata extractor for its own metadata extractor is rejected
            assertThrows(IllegalStateException.class, meta::getMetadataTextExtractor);
            assertEquals("", meta.getText());
            assertNotNull(meta.getDocument());
            assertTrue(meta.isCloseFilesystem());
            assertNotNull(meta.getFilesystem());

            // the setter is a NOP, so the flag still reads true afterwards
            meta.setCloseFilesystem(false);
            assertTrue(meta.isCloseFilesystem());

            meta.close();
        }
    }
}