// TabularFormatsTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser;


import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.text.DateFormatSymbols;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.TikaTest;

/**
 * Ensure that our various Table-based formats produce consistent,
 * broadly similar output.
 * This is mostly focused on the XHTML output
 */
public class TabularFormatsTest extends TikaTest {
    /** Short column names, in column order. */
    protected static final String[] columnNames =
            {"recnum", "square", "desc", "pctdone", "pctincr", "date", "datetime", "time"};

    /** Human readable column labels, in the same order as {@link #columnNames}. */
    protected static final String[] columnLabels =
            {"Record Number", "Square of the Record Number", "Description of the Row",
                    "Percent Done", "Percent Increment", "date", "datetime", "time"};

    /**
     * Zero-based indexes of the columns which hold percentages.
     * Not all parsers correctly format these...
     */
    protected static final List<Integer> percentageColumns = Arrays.asList(3, 4);

    private static final Logger LOG = LoggerFactory.getLogger(TabularFormatsTest.class);

    /**
     * One regex alternation per month (index 0 is the first month returned by
     * {@link DateFormatSymbols#getShortMonths()}), accepting the short month name in
     * upper case and as-is.
     * <p>
     * To prevent this build test from failing outside the English speaking world, both
     * the English and the default-locale month names are accepted (testCSV uses English
     * names, the other tests local names).
     */
    private static final String[] SHORT_MONTHS_EXPR;

    static {
        String[] shortMonthsEnglish = new DateFormatSymbols(Locale.ENGLISH).getShortMonths();
        String[] shortMonthsLocal = new DateFormatSymbols(Locale.getDefault()).getShortMonths();
        List<String> shortMonthsExpr = new ArrayList<>(12);
        for (int i = 0; i < 12; ++i) {
            String expr =
                    shortMonthsEnglish[i].toUpperCase(Locale.ENGLISH) + "|" + shortMonthsEnglish[i];
            // Only add the local names when they differ from the English ones
            if (!shortMonthsEnglish[i].equals(shortMonthsLocal[i])) {
                expr += "|" + shortMonthsLocal[i].toUpperCase(Locale.getDefault()) + "|" +
                        shortMonthsLocal[i];
            }
            LOG.info(expr);
            shortMonthsExpr.add(expr);
        }
        SHORT_MONTHS_EXPR = shortMonthsExpr.toArray(new String[0]);
    }

    /**
     * Expected values, by <em>column</em>: {@code table[column][row]}.
     * <p>
     * The columns are in the same order as {@link #columnNames}. A {@link String}
     * entry is an exact expected value, a {@link Pattern} entry is a regex the
     * value has to match.
     */
    protected static final Object[][] table =
            // Column 1 (recnum) starts on the next line, right after the outer array opens
            new Object[][]{new String[]{"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"},
                    // Column 2 (square)
                    new String[]{"0", "1", "4", "9", "16", "25", "36", "49", "64", "81", "100"},
                    // Column 3 (desc) - placeholder, replaced by the static initializer below
                    new String[]{}, // Generated later
                    // Column 4 (pctdone) - with or without decimal places
                    // NOTE(review): the '.' in these patterns is unescaped, so it matches any
                    //  character rather than only a literal dot - confirm that is acceptable
                    new Pattern[]{Pattern.compile("0%|0.00%"), Pattern.compile("10%|10.00%"),
                            Pattern.compile("20%|20.00%"), Pattern.compile("30%|30.00%"),
                            Pattern.compile("40%|40.00%"), Pattern.compile("50%|50.00%"),
                            Pattern.compile("60%|60.00%"), Pattern.compile("70%|70.00%"),
                            Pattern.compile("80%|80.00%"), Pattern.compile("90%|90.00%"),
                            Pattern.compile("100%|100.00%"),},
                    // Column 5 (pctincr) - the first row is expected to be empty
                    new Pattern[]{Pattern.compile(""), Pattern.compile("0.0%|0.00%"),
                            Pattern.compile("50.0%|50.00%"), Pattern.compile("66.7%|66.67%"),
                            Pattern.compile("75.0%|75.00%"), Pattern.compile("80.0%|80.00%"),
                            Pattern.compile("83.3%|83.33%"), Pattern.compile("85.7%|85.71%"),
                            Pattern.compile("87.5%|87.50%"), Pattern.compile("88.9%|88.89%"),
                            Pattern.compile("90.0%|90.00%"),},
                    // Column 6 (date) - the leading zero of the first two values is optional
                    new Pattern[]{Pattern.compile("0?1-01-1960"), Pattern.compile("0?2-01-1960"),
                            Pattern.compile("17-01-1960"), Pattern.compile("22-03-1960"),
                            Pattern.compile("13-09-1960"), Pattern.compile("17-09-1961"),
                            Pattern.compile("20-07-1963"), Pattern.compile("29-07-1966"),
                            Pattern.compile("20-03-1971"), Pattern.compile("18-12-1977"),
                            // The next line ends column 6 and opens column 7 (datetime):
                            //  day, short month name (English or local, see
                            //  SHORT_MONTHS_EXPR), 2 or 4 digit year, then the time
                            Pattern.compile("19-05-1987"),}, new Pattern[]{Pattern.compile(
                    "01(" + SHORT_MONTHS_EXPR[0] + ")(60|1960)[:\\s]00:00:01(.00)?"),
                    Pattern.compile(
                            "01(" + SHORT_MONTHS_EXPR[0] + ")(60|1960)[:\\s]00:00:10(.00)?"),
                    Pattern.compile(
                            "01(" + SHORT_MONTHS_EXPR[0] + ")(60|1960)[:\\s]00:01:40(.00)?"),
                    Pattern.compile(
                            "01(" + SHORT_MONTHS_EXPR[0] + ")(60|1960)[:\\s]00:16:40(.00)?"),
                    Pattern.compile(
                            "01(" + SHORT_MONTHS_EXPR[0] + ")(60|1960)[:\\s]02:46:40(.00)?"),
                    Pattern.compile(
                            "02(" + SHORT_MONTHS_EXPR[0] + ")(60|1960)[:\\s]03:46:40(.00)?"),
                    Pattern.compile(
                            "12(" + SHORT_MONTHS_EXPR[0] + ")(60|1960)[:\\s]13:46:40(.00)?"),
                    Pattern.compile(
                            "25(" + SHORT_MONTHS_EXPR[3] + ")(60|1960)[:\\s]17:46:40(.00)?"),
                    Pattern.compile(
                            "03(" + SHORT_MONTHS_EXPR[2] + ")(63|1963)[:\\s]09:46:40(.00)?"),
                    Pattern.compile(
                            "09(" + SHORT_MONTHS_EXPR[8] + ")(91|1991)[:\\s]01:46:40(.00)?"),
                    Pattern.compile(
                            "19(" + SHORT_MONTHS_EXPR[10] + ")(76|2276)[:\\s]17:46:40(.00)?")},
                    // Column 8 (time) - optional leading zero on the hour, optional
                    //  two digit fraction after the seconds
                    new Pattern[]{Pattern.compile("0?0:00:01(.\\d\\d)?"),
                            Pattern.compile("0?0:00:03(.\\d\\d)?"),
                            Pattern.compile("0?0:00:09(.\\d\\d)?"),
                            Pattern.compile("0?0:00:27(.\\d\\d)?"),
                            Pattern.compile("0?0:01:21(.\\d\\d)?"),
                            Pattern.compile("0?0:04:03(.\\d\\d)?"),
                            Pattern.compile("0?0:12:09(.\\d\\d)?"),
                            Pattern.compile("0?0:36:27(.\\d\\d)?"),
                            Pattern.compile("0?1:49:21(.\\d\\d)?"),
                            Pattern.compile("0?5:28:03(.\\d\\d)?"),
                            Pattern.compile("16:24:09(.\\d\\d)?")}};


    static {
        // Row text in 3rd column
        table[2] = new String[table[0].length];
        for (int i = 0; i < table[0].length; i++) {
            table[2][i] = "This is row " + i + " of 10";
        }
    }

    /**
     * Splits the markup of a single table row into its cells.
     * <p>
     * Each returned cell is the text following the opening {@code <td} / {@code <th},
     * cut off at the last closing tag, with whitespace trimmed and collapsed. Any
     * attributes and the {@code >} which ends the opening tag are therefore still
     * included, eg <code>title="x"&gt;value</code>. Self-closing cells, with or without
     * attributes, are returned as the empty string.
     *
     * @param row  the markup of one row
     * @param isTH true to split on {@code <th} cells, false to split on {@code <td} cells
     * @return the cells of the row, empty if the row has no cells
     */
    protected static String[] toCells(String row, boolean isTH) {
        // Split into cells, ignoring stuff before first cell
        String[] parts = row.split(isTH ? "<th" : "<td");
        if (parts.length <= 1) {
            // No cells at all. (split() returns an empty array if the row is nothing
            //  but the opening tag, which copyOfRange below would reject)
            return new String[0];
        }
        String[] cells = Arrays.copyOfRange(parts, 1, parts.length);

        // Ignore the closing tag onwards, and normalise whitespace
        for (int i = 0; i < cells.length; i++) {
            String cell = cells[i].trim();
            int closeAt = cell.lastIndexOf("</");

            // Self-closing cell, possibly with attributes: no content
            if (cell.equals("/>") || (closeAt < 0 && cell.endsWith("/>"))) {
                cells[i] = "";
                continue;
            }

            // Without a closing tag there is nothing to cut off, so keep the text
            //  for the caller to report rather than failing in substring
            if (closeAt >= 0) {
                cell = cell.substring(0, closeAt).trim();
            }
            cells[i] = cell.replaceAll("\\s+", " ");
        }
        return cells;
    }

    /**
     * Checks that the first row of the XHTML is the expected header row.
     *
     * @param xml      the XHTML to check
     * @param isTH     true if the header cells are {@code <th>}, false for {@code <td>}
     * @param hasLabel true if the header cells hold the column labels
     * @param hasName  true if the header cells hold the column names. When combined
     *                 with hasLabel, the name is expected as the title attribute
     *                 and the label as the cell text
     */
    protected void assertHeaders(String xml, boolean isTH, boolean hasLabel, boolean hasName) {
        // The first row is whatever sits between the first <tr> and the first </tr>
        int rowEnd = xml.indexOf("</tr>");
        int rowStart = xml.indexOf("<tr>");
        String hRow = xml.substring(0, rowEnd).substring(rowStart + 4);

        // Split into cells, ignoring stuff before first cell
        String[] cells = toCells(hRow, isTH);

        // One header cell is needed per column
        assertEquals(columnLabels.length, cells.length,
                "Wrong number of cells in header row " + hRow);

        // Check each header cell holds the right text
        for (int col = 0; col < cells.length; col++) {
            String cell = cells[col];
            if (!hasName) {
                // Labels only
                assertContains(">" + columnLabels[col], cell);
                continue;
            }
            if (hasLabel) {
                // Name as the title attribute, label as the text
                assertContains("title=\"" + columnNames[col] + "\"", cell);
                assertContains(">" + columnLabels[col], cell);
            } else {
                // Names only
                assertContains(">" + columnNames[col], cell);
            }
        }
    }

    /**
     * Checks the data rows of the XHTML against the expected values in {@link #table}.
     *
     * @param xml          the XHTML to check
     * @param hasHeader    true if the first row is a header row, which is skipped
     * @param doesPercents false if the parser doesn't format percentages, in which
     *                     case the columns listed in {@link #percentageColumns}
     *                     are not checked
     */
    protected void assertContents(String xml, boolean hasHeader, boolean doesPercents) {
        // Ignore anything before the first <tr>
        // Ignore the header row if there is one
        int ignores = 1;
        if (hasHeader) {
            ignores++;
        }

        // Split into rows, and discard the row closing (and anything after)
        String[] rows = xml.split("<tr>");
        // Fail with a readable message, rather than inside copyOfRange, if there
        //  aren't even enough rows to skip
        assertTrue(rows.length >= ignores,
                "Too few <tr> rows in output: need to skip " + ignores + " chunk(s), found " +
                        rows.length);
        rows = Arrays.copyOfRange(rows, ignores, rows.length);
        for (int i = 0; i < rows.length; i++) {
            rows[i] = rows[i].split("</tr>")[0].trim();
        }

        // Check we got the right number of rows
        for (int cn = 0; cn < table.length; cn++) {
            assertEquals(table[cn].length, rows.length,
                    "Wrong number of rows found compared to column " + (cn + 1));
        }

        // Check each row's values
        for (int rn = 0; rn < rows.length; rn++) {
            String[] cells = toCells(rows[rn], false);
            assertEquals(table.length, cells.length,
                    "Wrong number of values in row " + (rn + 1));

            for (int cn = 0; cn < table.length; cn++) {
                // If the parser doesn't know about % formats,
                //  skip the cell if the column is a % one
                if (!doesPercents && percentageColumns.contains(cn)) {
                    continue;
                }

                // Ignore cell attributes
                String val = cellText(cells[cn]);

                // Check
                String error = "Wrong text in row " + (rn + 1) + " and column " + (cn + 1) + " - " +
                        table[cn][rn] + " vs " + val;
                if (table[cn][rn] instanceof String) {
                    assertEquals(table[cn][rn], val, error);
                } else {
                    assertTrue(((Pattern) table[cn][rn]).matcher(val).matches(), error);
                }
            }
        }
    }

    /**
     * Returns the text of a cell from {@link #toCells(String, boolean)} without the
     * attributes and {@code >} of the opening tag: the text between the first
     * {@code >} and the next {@code >} (or the end of the cell).
     * <p>
     * Unlike {@code cell.split(">")[1]}, this copes with a cell that has no text
     * ({@code <td></td>} reaches here as just {@code >}) and with a cell that has
     * no {@code >} at all, which is returned unchanged.
     */
    private static String cellText(String cell) {
        int start = cell.indexOf('>') + 1; // 0 if there is no '>'
        int end = cell.indexOf('>', start);
        return end < 0 ? cell.substring(start) : cell.substring(start, end);
    }

    /**
     * SAS7BDAT: expects {@code <th>} header cells with the column name as the
     * title and the column label as the text, and formatted percentages.
     */
    @Test
    public void testSAS7BDAT() throws Exception {
        String xml = getXML("test-columnar.sas7bdat").xml;
        assertHeaders(xml, true, true, true);
        assertContents(xml, true, true);
    }

    /**
     * XLS: expects a header row of {@code <td>} cells holding the column
     * labels, and formatted percentages.
     */
    @Test
    public void testXLS() throws Exception {
        String xml = getXML("test-columnar.xls").xml;
        assertHeaders(xml, false, true, false);
        assertContents(xml, true, true);
    }

    /**
     * XLSX: expects a header row of {@code <td>} cells holding the column
     * labels, and formatted percentages.
     */
    @Test
    public void testXLSX() throws Exception {
        String xml = getXML("test-columnar.xlsx").xml;
        assertHeaders(xml, false, true, false);
        assertContents(xml, true, true);
    }

    /**
     * XLSB: expects a header row of {@code <td>} cells holding the column
     * labels, and formatted percentages.
     */
    @Test
    public void testXLSB() throws Exception {
        String xml = getXML("test-columnar.xlsb").xml;
        assertHeaders(xml, false, true, false);
        assertContents(xml, true, true);
    }

    // TODO Fix the ODS test - currently failing with
    // org.xml.sax.SAXException: Namespace http://www.w3.org/1999/xhtml not declared
//    @Test
//    public void testODS() throws Exception {
//        XMLResult result = getXML("test-columnar.ods");
//        String xml = result.xml;
//        assertHeaders(xml, false, true, false);
//        assertContents(xml, true, true);
//    }

    // TODO Test other formats, eg Database formats

    /**
     * Note - we don't have a dedicated CSV parser
     * <p>
     * This means we don't get proper HTML out, so rather than checking
     * cell by cell, this only checks that every label and every expected
     * value turns up somewhere in the output.
     */
    @Test
    public void testCSV() throws Exception {
        // Normalise whitespace before testing
        String xml = getXML("test-columnar.csv").xml.replaceAll("\\s+", " ");

        for (String label : columnLabels) {
            assertContains(label, xml);
        }
        for (Object[] column : table) {
            for (Object expected : column) {
                if (expected instanceof Pattern) {
                    boolean found = ((Pattern) expected).matcher(xml).find();
                    assertTrue(found, "Not matched: " + expected);
                } else if (expected instanceof String) {
                    assertContains((String) expected, xml);
                }
            }
        }
    }
}