DBFParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.dbf;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Calendar;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.Icu4jEncodingDetector;
import org.apache.tika.sax.XHTMLContentHandler;

/**
 * This is a Tika wrapper around the DBFReader.
 * <p>
 * This reads many dbase3 file variants (not DBASE 7, yet!).
 * <p>
 * It caches the first 10 rows and then runs encoding dectection
 * on the "character" cells.
 */
@TikaComponent
public class DBFParser implements Parser {

    private static final int ROWS_TO_BUFFER_FOR_CHARSET_DETECTION = 10;
    private static final int MAX_CHARS_FOR_CHARSET_DETECTION = 20000;
    private static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.application("x-dbf"));

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    @Override
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {
        DBFReader reader = DBFReader.open(tis);
        DBFFileHeader header = reader.getHeader();
        metadata.set(Metadata.CONTENT_TYPE, header.getVersion().getFullMimeString());

        //insert metadata here
        Calendar lastModified = header.getLastModified();
        if (lastModified != null) {
            metadata.set(TikaCoreProperties.MODIFIED, lastModified);
        }

        //buffer first X rows for charset detection
        List<DBFRow> firstRows = new LinkedList<>();
        DBFRow row = reader.next();
        int i = 0;
        while (row != null && i++ < ROWS_TO_BUFFER_FOR_CHARSET_DETECTION) {
            firstRows.add(row.deepCopy());
            row = reader.next();
        }

        Charset charset = getCharset(firstRows, header, context);
        metadata.set(Metadata.CONTENT_ENCODING, charset.toString());

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
        xhtml.startDocument();
        xhtml.startElement("table");
        xhtml.startElement("thead");
        for (DBFColumnHeader col : header.getCols()) {
            xhtml.startElement("th");
            xhtml.characters(col.getName(charset));
            xhtml.endElement("th");
        }
        xhtml.endElement("thead");

        xhtml.startElement("tbody");

        //now write cached rows
        while (firstRows.size() > 0) {
            DBFRow cachedRow = firstRows.remove(0);
            writeRow(cachedRow, charset, xhtml);
        }

        //now continue with rest
        while (row != null) {
            writeRow(row, charset, xhtml);
            row = reader.next();
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
        xhtml.endDocument();
    }

    private Charset getCharset(List<DBFRow> firstRows, DBFFileHeader header, ParseContext parseContext)
            throws IOException, TikaException {
        //TODO: potentially use codepage info in the header
        Charset charset = DEFAULT_CHARSET;
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        for (DBFRow row : firstRows) {
            for (DBFCell cell : row.cells) {
                if (cell.getColType().equals(DBFColumnHeader.ColType.C)) {
                    byte[] bytes = cell.getBytes();
                    bos.write(bytes);
                    if (bos.size() > MAX_CHARS_FOR_CHARSET_DETECTION) {
                        break;
                    }
                }
            }
        }
        byte[] bytes = bos.toByteArray();
        if (bytes.length > 20) {
            EncodingDetector detector = new Icu4jEncodingDetector();
            try (TikaInputStream tis = TikaInputStream.get(bytes)) {
                List<EncodingResult> results =
                        detector.detect(TikaInputStream.get(bytes), new Metadata(), parseContext);
                charset = results.isEmpty() ? null : results.get(0).getCharset();
            }
        }
        return charset;
    }

    private void writeRow(DBFRow row, Charset charset, XHTMLContentHandler xhtml)
            throws SAXException {
        xhtml.startElement("tr");
        for (DBFCell cell : row.cells) {
            xhtml.startElement("td");
            xhtml.characters(cell.getString(charset));
            xhtml.endElement("td");
        }
        xhtml.endElement("tr");

    }
}