CsvParser.java

package de.siegmar.fastcsv.reader;

import static de.siegmar.fastcsv.util.Util.CR;
import static de.siegmar.fastcsv.util.Util.LF;
import static de.siegmar.fastcsv.util.Util.containsDupe;

import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;

import de.siegmar.fastcsv.util.Limits;
import de.siegmar.fastcsv.util.Preconditions;
import de.siegmar.fastcsv.util.Util;

/*
 * This class contains ugly, performance optimized code - be warned!
 */
@SuppressWarnings({
    "checkstyle:CyclomaticComplexity",
    "checkstyle:ExecutableStatementCount",
    "checkstyle:InnerAssignment",
    "checkstyle:JavaNCSS",
    "checkstyle:NestedIfDepth",
    "PMD.UnusedAssignment"
})
final class CsvParser implements Closeable {

    private static final int STATUS_LAST_CHAR_WAS_CR = 32;
    private static final int STATUS_COMMENTED_RECORD = 16;
    private static final int STATUS_NEW_FIELD = 8;
    private static final int STATUS_QUOTED_MODE = 4;
    private static final int STATUS_QUOTED_FIELD = 2;
    private static final int STATUS_DATA_FIELD = 1;
    private static final int STATUS_RESET = 0;

    private final char fsep;
    private final char qChar;
    private final CommentStrategy cStrat;
    private final char cChar;
    private final boolean acceptCharsAfterQuotes;
    private final CsvCallbackHandler<?> callbackHandler;
    private final CsvBuffer csvBuffer;

    private long startingLineNumber;
    private int lines = 1;

    private int status;
    private boolean finished;

    CsvParser(final char fieldSeparator, final char quoteCharacter,
              final CommentStrategy commentStrategy, final char commentCharacter,
              final boolean acceptCharsAfterQuotes,
              final CsvCallbackHandler<?> callbackHandler, final Reader reader) {

        assertFields(fieldSeparator, quoteCharacter, commentCharacter);

        this.fsep = fieldSeparator;
        this.qChar = quoteCharacter;
        this.cStrat = commentStrategy;
        this.cChar = commentCharacter;
        this.acceptCharsAfterQuotes = acceptCharsAfterQuotes;
        this.callbackHandler = callbackHandler;
        csvBuffer = new CsvBuffer(reader);
    }

    CsvParser(final char fieldSeparator, final char quoteCharacter,
              final CommentStrategy commentStrategy, final char commentCharacter,
              final boolean acceptCharsAfterQuotes,
              final CsvCallbackHandler<?> callbackHandler, final String data) {

        assertFields(fieldSeparator, quoteCharacter, commentCharacter);

        this.fsep = fieldSeparator;
        this.qChar = quoteCharacter;
        this.cStrat = commentStrategy;
        this.cChar = commentCharacter;
        this.acceptCharsAfterQuotes = acceptCharsAfterQuotes;
        this.callbackHandler = callbackHandler;
        csvBuffer = new CsvBuffer(data);
    }

    private void assertFields(final char fieldSeparator, final char quoteCharacter, final char commentCharacter) {
        Preconditions.checkArgument(!Util.isNewline(fieldSeparator), "fieldSeparator must not be a newline char");
        Preconditions.checkArgument(!Util.isNewline(quoteCharacter), "quoteCharacter must not be a newline char");
        Preconditions.checkArgument(!Util.isNewline(commentCharacter), "commentCharacter must not be a newline char");
        Preconditions.checkArgument(!containsDupe(fieldSeparator, quoteCharacter, commentCharacter),
            "Control characters must differ"
                + " (fieldSeparator=%s, quoteCharacter=%s, commentCharacter=%s)",
            fieldSeparator, quoteCharacter, commentCharacter);
    }

    @SuppressWarnings("checkstyle:ReturnCount")
    boolean parse() throws IOException {
        if (finished) {
            // no more data available
            return false;
        }

        startingLineNumber += lines;
        lines = 1;
        callbackHandler.beginRecord(startingLineNumber);

        do {
            if (csvBuffer.len == csvBuffer.pos && !csvBuffer.fetchData()) {
                // buffer is processed and no more data available
                finished = true;
                return processBufferTail();
            }
        } while (consume(csvBuffer.buf, csvBuffer.len));

        // we read data (and passed it to the record handler)
        return true;
    }

    private boolean processBufferTail() {
        if (csvBuffer.begin < csvBuffer.pos) {
            // we have unconsumed data in the buffer
            materialize(csvBuffer.buf, csvBuffer.begin, csvBuffer.pos, status, qChar);
            return true;
        }

        if ((status & STATUS_NEW_FIELD) != 0 || (status & STATUS_COMMENTED_RECORD) != 0) {
            // the last character was a field separator or comment character ��� add empty field
            materialize(csvBuffer.buf, 0, 0, status, qChar);
            return true;
        }

        // no data left in buffer
        return false;
    }

    @SuppressWarnings("PMD.EmptyIfStmt")
    boolean consume(final char[] lBuf, final int lLen) {
        int lPos = csvBuffer.pos;
        int lBegin = csvBuffer.begin;
        int lStatus = status;
        boolean moreDataNeeded = true;

        OUTER:
        {
            mode_check:
            do {
                if ((lStatus & STATUS_QUOTED_MODE) != 0) {
                    // we're in quotes
                    while (lPos < lLen) {
                        final char c = lBuf[lPos++];

                        if (c == qChar) {
                            lStatus &= ~STATUS_QUOTED_MODE;
                            continue mode_check;
                        } else if (c == CR) {
                            lStatus |= STATUS_LAST_CHAR_WAS_CR;
                            lines++;
                        } else if (c == LF) {
                            if ((lStatus & STATUS_LAST_CHAR_WAS_CR) == 0) {
                                lines++;
                            } else {
                                lStatus &= ~STATUS_LAST_CHAR_WAS_CR;
                            }
                        } else {
                            // fast-forward
                            for (; lPos < lLen; lPos++) {
                                final char lookAhead = lBuf[lPos];
                                if (lookAhead == qChar || lookAhead == LF || lookAhead == CR) {
                                    break;
                                }
                            }
                        }
                    }
                } else if ((lStatus & STATUS_COMMENTED_RECORD) != 0) {
                    // commented line
                    while (lPos < lLen) {
                        final char lookAhead = lBuf[lPos++];

                        if (lookAhead == CR) {
                            materialize(lBuf, lBegin, lPos - 1, lStatus, qChar);
                            status = STATUS_LAST_CHAR_WAS_CR;
                            lBegin = lPos;
                            moreDataNeeded = false;
                            break OUTER;
                        } else if (lookAhead == LF) {
                            materialize(lBuf, lBegin, lPos - 1, lStatus, qChar);
                            status = STATUS_RESET;
                            lBegin = lPos;
                            moreDataNeeded = false;
                            break OUTER;
                        }
                    }
                } else {
                    // we're not in quotes
                    while (lPos < lLen) {
                        final char c = lBuf[lPos++];

                        if (c == fsep) {
                            materialize(lBuf, lBegin, lPos - 1, lStatus, qChar);
                            lStatus = STATUS_NEW_FIELD;
                            lBegin = lPos;
                        } else if (c == CR) {
                            materialize(lBuf, lBegin, lPos - 1, lStatus, qChar);
                            status = STATUS_LAST_CHAR_WAS_CR;
                            lBegin = lPos;
                            moreDataNeeded = false;
                            break OUTER;
                        } else if (c == LF) {
                            if ((lStatus & STATUS_LAST_CHAR_WAS_CR) == 0) {
                                materialize(lBuf, lBegin, lPos - 1, lStatus, qChar);
                                status = STATUS_RESET;
                                lBegin = lPos;
                                moreDataNeeded = false;
                                break OUTER;
                            }

                            lStatus = STATUS_RESET;
                            lBegin = lPos;
                        } else if (cStrat != CommentStrategy.NONE && c == cChar
                            && (lStatus == STATUS_RESET || lStatus == STATUS_LAST_CHAR_WAS_CR)) {
                            lBegin = lPos;
                            lStatus = STATUS_COMMENTED_RECORD;
                            continue mode_check;
                        } else if (c == qChar && (lStatus & STATUS_DATA_FIELD) == 0) {
                            // quote and not in data-only mode
                            lStatus = STATUS_QUOTED_FIELD | STATUS_QUOTED_MODE;
                            continue mode_check;
                        } else {
                            if ((lStatus & STATUS_QUOTED_FIELD) == 0) {
                                // normal unquoted data
                                lStatus = STATUS_DATA_FIELD;

                                // fast-forward
                                for (; lPos < lLen; lPos++) {
                                    final char lookAhead = lBuf[lPos];
                                    if (lookAhead == fsep || lookAhead == LF || lookAhead == CR) {
                                        break;
                                    }
                                }
                            } else if (!acceptCharsAfterQuotes) {
                                throw new CsvParseException("Unexpected character after closing quote: " + c);
                            }
                        }
                    }
                }
            } while (lPos < lLen);

            status = lStatus;
        }

        csvBuffer.pos = lPos;
        csvBuffer.begin = lBegin;

        return moreDataNeeded;
    }

    private void materialize(final char[] lBuf,
                             final int lBegin, final int lPos, final int lStatus,
                             final char quoteCharacter) {

        if ((lStatus & STATUS_QUOTED_FIELD) != 0) {
            // field with quotes
            final int beginAfterQuote = lBegin + 1;
            final int endAfterField = lPos - (lBuf[lPos - 1] == quoteCharacter ? 1 : 0);
            callbackHandler.addField(lBuf, beginAfterQuote,
                cleanDelimiters(lBuf, beginAfterQuote, endAfterField, quoteCharacter), true);
            return;
        }

        if ((lStatus & STATUS_COMMENTED_RECORD) != 0) {
            // commented line
            callbackHandler.setComment(lBuf, lBegin, lPos - lBegin);
            return;
        }

        // field without quotes
        callbackHandler.addField(lBuf, lBegin, lPos - lBegin, false);
    }

    /**
     * Remove escapes from the field data.
     * <p>
     * The input buffer could look like this: {@code foo ""is"" bar}
     *
     * @param buf            the buffer containing the field data
     * @param begin          the start position of the field data (after the opening quote)
     * @param end            the end position of the field data (on the closing quote / end of buffer)
     * @param quoteCharacter the quote character
     * @return the length of the field data after removing escapes
     */
    private static int cleanDelimiters(final char[] buf, final int begin, final int end,
                                       final char quoteCharacter) {

        int i = begin;

        // fast-forward to first quote
        while (i < end && buf[i] != quoteCharacter) {
            i++;
        }

        int newPos = i;
        boolean escape = false;
        for (; i < end; i++) {
            final char c = buf[i];
            if (c == quoteCharacter) {
                escape = !escape;
                if (escape) {
                    // skip quote
                    continue;
                }
            }

            // shift character
            buf[newPos++] = c;
        }

        return newPos - begin;
    }

    public long getStartingLineNumber() {
        return startingLineNumber;
    }

    @SuppressWarnings("checkstyle:HiddenField")
    void reset(final long startingLineNumber) {
        this.startingLineNumber = startingLineNumber;
        csvBuffer.reset();
    }

    @Override
    public void close() throws IOException {
        csvBuffer.close();
    }

    String peekLine() throws IOException {
        final int savedPos = csvBuffer.pos;

        for (; csvBuffer.pos < csvBuffer.len || csvBuffer.fetchData(); csvBuffer.pos++) {
            final char c = csvBuffer.buf[csvBuffer.pos];
            if (c == CR || c == LF) {
                break;
            }
        }

        final String s = new String(csvBuffer.buf, csvBuffer.begin, csvBuffer.pos - csvBuffer.begin);
        csvBuffer.pos = savedPos;
        return s;
    }

    boolean skipLine(final int numCharsToSkip) throws IOException {
        // Skip chars that have been peeked already
        csvBuffer.pos += numCharsToSkip;

        while (csvBuffer.pos < csvBuffer.len || csvBuffer.fetchData()) {
            final char c = csvBuffer.buf[csvBuffer.pos++];
            if (c == CR) {
                if ((csvBuffer.pos < csvBuffer.len || csvBuffer.fetchData())
                    && csvBuffer.buf[csvBuffer.pos] == LF) {
                    // CRLF
                    csvBuffer.pos++;
                }
                break;
            } else if (c == LF) {
                break;
            }
        }

        if (csvBuffer.begin < csvBuffer.pos) {
            csvBuffer.begin = csvBuffer.pos;
            startingLineNumber++;
            return true;
        }

        return false;
    }

    @SuppressWarnings("checkstyle:visibilitymodifier")
    private static class CsvBuffer implements Closeable {

        private static final int READ_SIZE = 8192;
        private static final int BUFFER_SIZE = READ_SIZE;

        char[] buf;
        int len;
        int begin;
        int pos;

        private final Reader reader;

        CsvBuffer(final Reader reader) {
            this.reader = reader;
            buf = new char[BUFFER_SIZE];
        }

        CsvBuffer(final String data) {
            reader = null;
            buf = data.toCharArray();
            len = data.length();
        }

        /**
         * Reads data from the underlying reader and manages the local buffer.
         *
         * @return {@code true}, if data was fetched, {@code false} if the end of the stream was reached
         * @throws IOException if a read error occurs
         */
        private boolean fetchData() throws IOException {
            if (reader == null) {
                return false;
            }

            if (begin < pos) {
                // we have data that can be relocated

                if (READ_SIZE > buf.length - pos) {
                    // need to relocate data in buffer -- not enough capacity left

                    final int lenToCopy = pos - begin;
                    if (READ_SIZE > buf.length - lenToCopy) {
                        // need to relocate data in new, larger buffer
                        buf = extendAndRelocate(buf, begin);
                    } else {
                        // relocate data in existing buffer
                        System.arraycopy(buf, begin, buf, 0, lenToCopy);
                    }
                    pos -= begin;
                    begin = 0;
                }
            } else {
                // all data was consumed -- nothing to relocate
                pos = begin = 0;
            }

            final int cnt = reader.read(buf, pos, READ_SIZE);
            if (cnt == -1) {
                return false;
            }
            len = pos + cnt;
            return true;
        }

        private static char[] extendAndRelocate(final char[] buf, final int begin) {
            final int newBufferSize = buf.length * 2;
            if (newBufferSize > Limits.MAX_FIELD_SIZE) {
                throw new CsvParseException(String.format("The maximum buffer size of %d is "
                        + "insufficient to read the data of a single field. "
                        + "This issue typically arises when a quotation begins but does not conclude within the "
                        + "confines of this buffer's maximum limit.",
                    Limits.MAX_FIELD_SIZE));
            }
            final char[] newBuf = new char[newBufferSize];
            System.arraycopy(buf, begin, newBuf, 0, buf.length - begin);
            return newBuf;
        }

        private void reset() {
            len = 0;
            begin = 0;
            pos = 0;
        }

        @Override
        public void close() throws IOException {
            if (reader != null) {
                reader.close();
            }
        }

    }

}