Latin1StringsParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.strings;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.Set;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;

/**
 * Parser to extract printable Latin1 strings from arbitrary files with pure java
 * without running any external process. Useful for binary or unknown files, for
 * files without a specific parser and for corrupted ones causing a TikaException
 * as a fallback parser. To enable the parsing of unknown files or files without a
 * specific parser with AutoDetectParser:
 * <pre>
 * AutoDetectParser parser = new AutoDetectParser();
 * parser.setFallback(new Latin1StringsParser());
 * </pre>
 * Currently the parser does a best effort to extract Latin1 strings, used by
 * Western European languages, encoded with ISO-8859-1, UTF-8 or UTF-16 charsets
 * mixed within the same file.
 * <p>
 * The implementation is optimized for fast parsing with only one pass.
 * <p>
 * The scanning state (buffers and positions) lives in instance fields, so
 * {@link #parse(TikaInputStream, ContentHandler, Metadata, ParseContext)} runs
 * every parse on a fresh private instance that inherits the configured minimum
 * string size.
 */
@TikaComponent(spi = false)
public class Latin1StringsParser implements Parser {

    private static final long serialVersionUID = 1L;

    /**
     * The set of supported types
     */
    private static final Set<MediaType> SUPPORTED_TYPES = getTypes();

    /**
     * The valid ISO-8859-1 character map, indexed by unsigned byte value.
     */
    private static final boolean[] CHAR_MAP = getCharMap();

    /**
     * The size of the internal buffers.
     */
    private static final int BUF_SIZE = 64 * 1024;

    /**
     * The charset name used to decode the output buffer.
     */
    private static final String OUTPUT_CHARSET = "windows-1252";

    /**
     * The minimum size of a character sequence to be extracted.
     */
    private int minSize = 4;

    /**
     * The output buffer. Allocated by {@code doParse}, so the instance that
     * only delegates from {@code parse} does not hold unused buffers.
     */
    private byte[] output;

    /**
     * The input buffer. Allocated by {@code doParse}.
     */
    private byte[] input;

    /**
     * The temporary position into the output buffer: the end of the bytes
     * written so far, including a candidate string not yet accepted.
     */
    private int tmpPos = 0;

    /**
     * The current position into the output buffer: the end of the accepted
     * strings.
     */
    private int outPos = 0;

    /**
     * The number of bytes into the input buffer.
     */
    private int inSize = 0;

    /**
     * The position into the input buffer.
     */
    private int inPos = 0;

    /**
     * The output content handler.
     */
    private XHTMLContentHandler xhtml;

    /**
     * Populates the valid ISO-8859-1 character map: 0x20-0x7E, 0xC0-0xFE and
     * the tab, line feed and carriage return control characters.
     *
     * @return the valid ISO-8859-1 character map.
     */
    private static boolean[] getCharMap() {
        boolean[] map = new boolean[256];
        for (int c = 0x20; c <= 0x7E; c++) {
            map[c] = true;
        }
        for (int c = 0xC0; c <= 0xFE; c++) {
            map[c] = true;
        }
        map[0x09] = true;
        map[0x0A] = true;
        map[0x0D] = true;
        return map;
    }

    /**
     * Returns the set of supported types.
     *
     * @return the set of supported types
     */
    private static Set<MediaType> getTypes() {
        Set<MediaType> supportedTypes = new HashSet<>();
        supportedTypes.add(MediaType.OCTET_STREAM);
        return supportedTypes;
    }

    /**
     * Tests if the byte is a ISO-8859-1 char.
     *
     * @param c the byte to test.
     * @return if the byte is a char.
     */
    private static boolean isChar(byte c) {
        return CHAR_MAP[c & 0xFF];
    }

    /**
     * Tests if {@code trail} completes a two-byte UTF-8 sequence started by
     * {@code lead} that this parser maps to a single Latin1 byte: 0xC3 accepts
     * 0x80-0xBF and 0xC2 accepts 0xA0-0xBF.
     *
     * @param lead  the lead byte, either 0xC2 or 0xC3.
     * @param trail the byte following the lead byte.
     * @return if the pair is an accepted UTF-8 sequence.
     */
    private static boolean isLatin1Trail(byte lead, byte trail) {
        int value = trail & 0xFF;
        int lowest = lead == (byte) 0xC3 ? 0x80 : 0xA0;
        return value >= lowest && value <= 0xBF;
    }

    /**
     * Returns the minimum size of a character sequence to be extracted.
     *
     * @return the minimum size of a character sequence
     */
    public int getMinSize() {
        return minSize;
    }

    /**
     * Sets the minimum size of a character sequence to be extracted.
     *
     * @param minSize the minimum size of a character sequence; must be at
     *                least 0 and smaller than the internal buffer size
     * @throws IllegalArgumentException if minSize is out of that range, since
     *                                  the buffer flushing logic cannot make
     *                                  progress with such a value
     */
    public void setMinSize(int minSize) {
        if (minSize < 0 || minSize >= BUF_SIZE) {
            throw new IllegalArgumentException(
                    "minSize must be between 0 and " + (BUF_SIZE - 1) + ", got " + minSize);
        }
        this.minSize = minSize;
    }

    /**
     * Flushes the internal output buffer to the content handler. If the
     * candidate string is already long enough to be accepted, everything but
     * its last minSize bytes is emitted; those bytes are kept so that the
     * string still passes the length test when it finally ends. The bytes not
     * emitted are moved to the start of the buffer.
     *
     * @throws UnsupportedEncodingException if the output charset is not supported
     * @throws SAXException                 if the content handler fails
     */
    private void flushBuffer() throws UnsupportedEncodingException, SAXException {
        if (tmpPos - outPos >= minSize) {
            outPos = tmpPos - minSize;
        }

        xhtml.characters(new String(output, 0, outPos, OUTPUT_CHARSET));

        int pending = tmpPos - outPos;
        if (pending > 0) {
            System.arraycopy(output, outPos, output, 0, pending);
        }
        tmpPos = pending;
        outPos = 0;
    }

    /**
     * Appends a byte to the candidate string and flushes the output buffer
     * when it becomes full.
     *
     * @param b the byte to append.
     * @throws UnsupportedEncodingException if the output charset is not supported
     * @throws SAXException                 if the content handler fails
     */
    private void append(byte b) throws UnsupportedEncodingException, SAXException {
        output[tmpPos++] = b;
        if (tmpPos == BUF_SIZE) {
            flushBuffer();
        }
    }

    /**
     * Ends the candidate string: accepts it followed by a line feed when it
     * has at least minSize bytes, discards it otherwise.
     *
     * @throws UnsupportedEncodingException if the output charset is not supported
     * @throws SAXException                 if the content handler fails
     */
    private void endString() throws UnsupportedEncodingException, SAXException {
        if (tmpPos - outPos >= minSize) {
            output[tmpPos++] = 0x0A;
            outPos = tmpPos;
            if (tmpPos == BUF_SIZE) {
                flushBuffer();
            }
        } else {
            tmpPos = outPos;
        }
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext arg0) {
        return SUPPORTED_TYPES;
    }

    /**
     * @see org.apache.tika.parser.Parser#parse(TikaInputStream,
     * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
     * org.apache.tika.parser.ParseContext)
     */
    @Override
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException {
        /*
         * Creates a new instance because the object is not immutable. The
         * configured minimum size has to be copied, otherwise the new instance
         * would always use the default one.
         */
        Latin1StringsParser worker = new Latin1StringsParser();
        worker.minSize = minSize;
        worker.doParse(tis, handler, metadata, context);
    }

    /**
     * Does a best effort to extract Latin1 strings encoded with ISO-8859-1,
     * UTF-8 or UTF-16. Valid chars are saved into the output buffer and the
     * temporary buffer position is incremented. When an invalid char is read,
     * the difference of the temporary and current buffer position is checked.
     * If it is at least the minimum string size, the current buffer position
     * is updated to the temp position. If it is not, the temp position is
     * reset to the current position.
     *
     * @param tis      the input stream.
     * @param handler  the output content handler
     * @param metadata the metadata of the file
     * @param context  the parsing context
     * @throws IOException  if an io error occurs
     * @throws SAXException if a sax error occurs
     */
    private void doParse(InputStream tis, ContentHandler handler, Metadata metadata,
                         ParseContext context) throws IOException, SAXException {

        input = new byte[BUF_SIZE];
        output = new byte[BUF_SIZE];
        tmpPos = 0;
        outPos = 0;

        xhtml = new XHTMLContentHandler(handler, metadata, context);
        xhtml.startDocument();

        int read;
        do {
            /*
             * Fills the input buffer. The loop also stops with read == 0 once
             * the buffer is full, because a zero length read returns 0.
             */
            inSize = 0;
            while ((read = tis.read(input, inSize, BUF_SIZE - inSize)) > 0) {
                inSize += read;
            }
            inPos = 0;
            while (inPos < inSize) {
                byte c = input[inPos++];
                /*
                 * Test for a possible UTF8 encoded char
                 */
                if (c == (byte) 0xC3 || c == (byte) 0xC2) {
                    /*
                     * The second byte may lie beyond the buffered data, in
                     * which case it is read directly from the stream. At the
                     * end of the stream the cast yields 0xFF, which is neither
                     * a continuation byte nor a valid char.
                     */
                    byte next = inPos < inSize ? input[inPos++] : (byte) tis.read();
                    if (isLatin1Trail(c, next)) {
                        append(c == (byte) 0xC3 ? (byte) (next + 0x40) : next);
                        continue;
                    }
                    /*
                     * Not a UTF8 sequence: the lead byte is itself a valid
                     * char and the second byte is examined as a plain byte.
                     */
                    append(c);
                    c = next;
                }
                /*
                 * Test if the byte is a valid char.
                 */
                if (isChar(c)) {
                    append(c);
                } else if (c != 0 || (inPos >= 3 && isChar(input[inPos - 3])) ||
                        (inPos + 1 < inSize && isChar(input[inPos + 1]))) {
                    /*
                     * The byte is an invalid char, marking a string end. A
                     * zero only ends the string if there is a valid char 2
                     * positions before or ahead, meaning it marks the
                     * transition between ISO-8859-1 and UTF16 sequences.
                     */
                    endString();
                }
            }
        } while (read != -1 && !Thread.currentThread().isInterrupted());

        /*
         * Accepts the last candidate string, if it is long enough, and emits
         * what is left in the output buffer.
         */
        if (tmpPos - outPos >= minSize) {
            output[tmpPos++] = 0x0A;
            outPos = tmpPos;
        }
        xhtml.characters(new String(output, 0, outPos, OUTPUT_CHARSET));

        xhtml.endDocument();

    }

}