// StrictCsvParser.java

package de.siegmar.fastcsv.reader;

import static de.siegmar.fastcsv.util.Util.CR;
import static de.siegmar.fastcsv.util.Util.LF;

import java.io.Closeable;
import java.io.EOFException;
import java.io.IOException;
import java.io.Reader;

import de.siegmar.fastcsv.util.Nullable;
import de.siegmar.fastcsv.util.Preconditions;
import de.siegmar.fastcsv.util.Util;

/*
 * This class contains ugly, performance optimized code - be warned!
 *
 * The parser works directly on the char buffer of the nested CsvBuffer and tracks its progress in a
 * small bit field (see the STATUS_* constants). Completed fields, comments and empty lines are
 * reported to the CsvCallbackHandler passed to the constructor.
 */
@SuppressWarnings({
    "checkstyle:CyclomaticComplexity",
    "checkstyle:ExecutableStatementCount",
    "checkstyle:InnerAssignment",
    "checkstyle:JavaNCSS",
    "checkstyle:NestedIfDepth"
})
final class StrictCsvParser implements CsvParser {

    // Bit flags stored in `status` / `lStatus`.

    /// The previously consumed character was a CR; a directly following LF belongs to the same line break.
    private static final int STATUS_LAST_CHAR_WAS_CR = 32;

    /// The current record is a comment line.
    private static final int STATUS_COMMENTED_RECORD = 16;

    /// A field separator was just consumed, so a new (possibly empty) field has started.
    private static final int STATUS_NEW_FIELD = 8;

    /// The parser is currently between an opening and a closing quote character.
    private static final int STATUS_QUOTED_MODE = 4;

    /// The current field started with a quote character.
    private static final int STATUS_QUOTED_FIELD = 2;

    /// The current field started with regular (unquoted) data.
    private static final int STATUS_DATA_FIELD = 1;

    /// No flag set: start of a record.
    private static final int STATUS_RESET = 0;

    private final char fsep;
    private final char qChar;
    private final CommentStrategy cStrat;
    private final char cChar;
    private final boolean allowExtraCharsAfterClosingQuote;
    private final CsvCallbackHandler<?> callbackHandler;
    private final CsvBuffer csvBuffer;

    /// Line number passed to the callback handler for the current record.
    private long startingLineNumber;

    /// Number of lines the current record spans; added to `startingLineNumber` by the next `parse()` call.
    private int lines = 1;

    /// `true` until the first field separator of the current record was consumed.
    private boolean firstField;

    /// Parser state (combination of the STATUS_* flags) carried over between `consume` calls.
    private int status;

    /// Set once the end of the input was reached; further `parse()` calls return `false`.
    private boolean finished;

    /// Creates a parser that reads its data from a `Reader`.
    ///
    /// @param fieldSeparator                   the field separator character
    /// @param quoteCharacter                   the quote character
    /// @param commentStrategy                  the comment strategy (`CommentStrategy.NONE` disables comment detection)
    /// @param commentCharacter                 the character that starts a comment line
    /// @param allowExtraCharsAfterClosingQuote whether characters between a closing quote and the next
    ///                                         separator / line end are tolerated instead of rejected
    /// @param callbackHandler                  the handler that receives fields, comments and empty-line notifications
    /// @param maxBufferSize                    the maximum size of the internal buffer (must be > 0)
    /// @param reader                           the data source
    @SuppressWarnings("checkstyle:ParameterNumber")
    StrictCsvParser(final char fieldSeparator, final char quoteCharacter,
                    final CommentStrategy commentStrategy, final char commentCharacter,
                    final boolean allowExtraCharsAfterClosingQuote,
                    final CsvCallbackHandler<?> callbackHandler,
                    final int maxBufferSize,
                    final Reader reader) {

        assertFields(fieldSeparator, quoteCharacter, commentCharacter);

        fsep = fieldSeparator;
        qChar = quoteCharacter;
        cStrat = commentStrategy;
        cChar = commentCharacter;
        this.allowExtraCharsAfterClosingQuote = allowExtraCharsAfterClosingQuote;
        this.callbackHandler = callbackHandler;
        csvBuffer = new CsvBuffer(reader, maxBufferSize);
    }

    /// Creates a parser that reads its data from a fixed string.
    ///
    /// @param fieldSeparator                   the field separator character
    /// @param quoteCharacter                   the quote character
    /// @param commentStrategy                  the comment strategy (`CommentStrategy.NONE` disables comment detection)
    /// @param commentCharacter                 the character that starts a comment line
    /// @param allowExtraCharsAfterClosingQuote whether characters between a closing quote and the next
    ///                                         separator / line end are tolerated instead of rejected
    /// @param callbackHandler                  the handler that receives fields, comments and empty-line notifications
    /// @param data                             the complete CSV data
    StrictCsvParser(final char fieldSeparator, final char quoteCharacter,
                    final CommentStrategy commentStrategy, final char commentCharacter,
                    final boolean allowExtraCharsAfterClosingQuote,
                    final CsvCallbackHandler<?> callbackHandler,
                    final String data) {

        assertFields(fieldSeparator, quoteCharacter, commentCharacter);

        fsep = fieldSeparator;
        qChar = quoteCharacter;
        cStrat = commentStrategy;
        cChar = commentCharacter;
        this.allowExtraCharsAfterClosingQuote = allowExtraCharsAfterClosingQuote;
        this.callbackHandler = callbackHandler;
        csvBuffer = new CsvBuffer(data);
    }

    /// Validates the control characters: none may be a newline character and all three must differ.
    private static void assertFields(final char fieldSeparator, final char quoteCharacter,
                                     final char commentCharacter) {
        Preconditions.checkArgument(!Util.isNewline(fieldSeparator), "fieldSeparator must not contain newline chars");
        Preconditions.checkArgument(!Util.isNewline(quoteCharacter), "quoteCharacter must not be a newline char");
        Preconditions.checkArgument(!Util.isNewline(commentCharacter), "commentCharacter must not be a newline char");
        Preconditions.checkArgument(!Util.containsDupe(fieldSeparator, quoteCharacter, commentCharacter),
            "Control characters must differ (fieldSeparator=%s, quoteCharacter=%s, commentCharacter=%s)".formatted(
            fieldSeparator, quoteCharacter, commentCharacter));
    }

    /// Parses the next record and reports its content to the callback handler.
    ///
    /// @return `true` if data was read and passed to the callback handler,
    ///     `false` if no more data is available
    /// @throws IOException if reading from the underlying reader fails
    @SuppressWarnings("checkstyle:ReturnCount")
    @Override
    public boolean parse() throws IOException {
        if (finished) {
            // no more data available
            return false;
        }

        // advance by the number of lines the previous record occupied
        startingLineNumber += lines;
        lines = 1;
        callbackHandler.beginRecord(startingLineNumber);
        firstField = true;

        do {
            if (csvBuffer.len == csvBuffer.pos && !csvBuffer.fetchData()) {
                // buffer is processed and no more data available
                finished = true;
                return processBufferTail();
            }
        } while (consume(csvBuffer.buf, csvBuffer.len));

        // we read data (and passed it to the record handler)
        return true;
    }

    /// Handles whatever is left once the end of the input was reached.
    ///
    /// @return `true` if a final field or comment was passed to the callback handler
    private boolean processBufferTail() {
        if (csvBuffer.begin < csvBuffer.pos) {
            // we have unconsumed data in the buffer
            materialize(csvBuffer.buf, csvBuffer.begin, csvBuffer.pos, status, qChar);
            return true;
        }

        if ((status & STATUS_NEW_FIELD) != 0 || (status & STATUS_COMMENTED_RECORD) != 0) {
            // the last character was a field separator or comment character - add empty field
            materialize(csvBuffer.buf, 0, 0, status, qChar);
            return true;
        }

        // no data left in buffer
        return false;
    }

    /// Consumes buffered characters until a record is complete or the buffer is exhausted.
    ///
    /// Works on local copies of the buffer position and the status and writes them back on exit.
    ///
    /// @param lBuf the buffer to read from
    /// @param lLen the number of valid characters in `lBuf`
    /// @return `true` if the record is not complete yet and more data is needed,
    ///     `false` if a record terminator was consumed
    /// @throws CsvParseException if a character follows a closing quote and
    ///     `allowExtraCharsAfterClosingQuote` is `false`
    @SuppressWarnings("LabelledBreakTarget")
    boolean consume(final char[] lBuf, final int lLen) {
        int lPos = csvBuffer.pos;
        int lBegin = csvBuffer.begin;
        int lStatus = status;
        boolean moreDataNeeded = true;

        // `break OUTER` leaves the block after a record terminator; `status` was already set in that case.
        OUTER:
        {
            // `continue mode_check` re-evaluates which of the three modes below applies.
            mode_check:
            do {
                if ((lStatus & STATUS_QUOTED_MODE) != 0) {
                    // we're in quotes
                    while (lPos < lLen) {
                        final char c = lBuf[lPos++];

                        if (c == qChar) {
                            // Leave quoted mode. Also drop a pending CR marker: a CR inside the quotes
                            // must not turn a following LF record terminator into the tail of a CRLF.
                            lStatus &= ~(STATUS_QUOTED_MODE | STATUS_LAST_CHAR_WAS_CR);
                            continue mode_check;
                        } else if (c == CR) {
                            lStatus |= STATUS_LAST_CHAR_WAS_CR;
                            lines++;
                        } else if (c == LF) {
                            if ((lStatus & STATUS_LAST_CHAR_WAS_CR) == 0) {
                                lines++;
                            } else {
                                // LF of a CRLF pair - the line was already counted for the CR
                                lStatus &= ~STATUS_LAST_CHAR_WAS_CR;
                            }
                        } else {
                            // an ordinary character separates a previous CR from any later LF
                            lStatus &= ~STATUS_LAST_CHAR_WAS_CR;

                            // fast-forward
                            for (; lPos < lLen; lPos++) {
                                final char lookAhead = lBuf[lPos];
                                if (lookAhead == qChar || lookAhead == LF || lookAhead == CR) {
                                    break;
                                }
                            }
                        }
                    }
                } else if ((lStatus & STATUS_COMMENTED_RECORD) != 0) {
                    // commented line
                    while (lPos < lLen) {
                        final char lookAhead = lBuf[lPos++];

                        if (lookAhead == CR) {
                            materialize(lBuf, lBegin, lPos - 1, lStatus, qChar);
                            status = STATUS_LAST_CHAR_WAS_CR;
                            lBegin = lPos;
                            moreDataNeeded = false;
                            break OUTER;
                        } else if (lookAhead == LF) {
                            materialize(lBuf, lBegin, lPos - 1, lStatus, qChar);
                            status = STATUS_RESET;
                            lBegin = lPos;
                            moreDataNeeded = false;
                            break OUTER;
                        }
                    }
                } else {
                    // we're not in quotes
                    while (lPos < lLen) {
                        final char c = lBuf[lPos++];

                        if (c == fsep) {
                            materialize(lBuf, lBegin, lPos - 1, lStatus, qChar);
                            lStatus = STATUS_NEW_FIELD;
                            lBegin = lPos;
                            firstField = false;
                        } else if (c == CR) {
                            if (firstField && lPos - 1 == lBegin) {
                                // line terminator directly at the start of the record
                                callbackHandler.setEmpty();
                            } else {
                                materialize(lBuf, lBegin, lPos - 1, lStatus, qChar);
                            }
                            status = STATUS_LAST_CHAR_WAS_CR;
                            lBegin = lPos;
                            moreDataNeeded = false;
                            break OUTER;
                        } else if (c == LF) {
                            if ((lStatus & STATUS_LAST_CHAR_WAS_CR) == 0) {
                                if (firstField && lPos - 1 == lBegin) {
                                    // line terminator directly at the start of the record
                                    callbackHandler.setEmpty();
                                } else {
                                    materialize(lBuf, lBegin, lPos - 1, lStatus, qChar);
                                }
                                status = STATUS_RESET;
                                lBegin = lPos;
                                moreDataNeeded = false;
                                break OUTER;
                            }

                            // LF of a CRLF pair whose CR already terminated the previous record - skip it
                            lStatus = STATUS_RESET;
                            lBegin = lPos;
                        } else if (cStrat != CommentStrategy.NONE && c == cChar
                            && (lStatus == STATUS_RESET || lStatus == STATUS_LAST_CHAR_WAS_CR)) {
                            // comment character at the very start of a record
                            lBegin = lPos;
                            lStatus = STATUS_COMMENTED_RECORD;
                            continue mode_check;
                        } else if (c == qChar && (lStatus & STATUS_DATA_FIELD) == 0) {
                            // quote and not in data-only mode
                            lStatus = STATUS_QUOTED_FIELD | STATUS_QUOTED_MODE;
                            continue mode_check;
                        } else {
                            if ((lStatus & STATUS_QUOTED_FIELD) == 0) {
                                // normal unquoted data
                                lStatus = STATUS_DATA_FIELD;

                                // fast-forward
                                for (; lPos < lLen; lPos++) {
                                    final char lookAhead = lBuf[lPos];
                                    if (lookAhead == fsep || lookAhead == LF || lookAhead == CR) {
                                        break;
                                    }
                                }
                            } else if (!allowExtraCharsAfterClosingQuote) {
                                throw new CsvParseException("Unexpected character after closing quote: '%c' (0x%x)"
                                    .formatted(c, (int) c));
                            }
                        }
                    }
                }
            } while (lPos < lLen);

            // buffer exhausted without completing the record - remember where we are
            status = lStatus;
        }

        csvBuffer.pos = lPos;
        csvBuffer.begin = lBegin;

        return moreDataNeeded;
    }

    /// Passes the data between `lBegin` and `lPos` to the callback handler, either as a quoted field,
    /// as a comment or as an unquoted field - depending on `lStatus`.
    ///
    /// @param lBuf           the buffer containing the data
    /// @param lBegin         the start position of the data (for quoted fields: the opening quote)
    /// @param lPos           the end position of the data (exclusive)
    /// @param lStatus        the parser status that applies to the data
    /// @param quoteCharacter the quote character
    private void materialize(final char[] lBuf,
                             final int lBegin, final int lPos, final int lStatus,
                             final char quoteCharacter) {

        if ((lStatus & STATUS_QUOTED_FIELD) != 0) {
            // field with quotes
            final int beginAfterQuote = lBegin + 1;
            // drop a trailing quote character, if the data ends with one
            final int endAfterField = lPos - (lBuf[lPos - 1] == quoteCharacter ? 1 : 0);
            callbackHandler.addField(lBuf, beginAfterQuote,
                cleanDelimiters(lBuf, beginAfterQuote, endAfterField, quoteCharacter), true);
            return;
        }

        if ((lStatus & STATUS_COMMENTED_RECORD) != 0) {
            // commented line
            callbackHandler.setComment(lBuf, lBegin, lPos - lBegin);
            return;
        }

        // field without quotes
        callbackHandler.addField(lBuf, lBegin, lPos - lBegin, false);
    }

    /// Remove escapes from the field data. The data is compacted in place within `buf`.
    ///
    /// The input buffer could look like this: `foo ""is"" bar`
    ///
    /// @param buf            the buffer containing the field data
    /// @param begin          the start position of the field data (after the opening quote)
    /// @param end            the end position of the field data (on the closing quote / end of buffer)
    /// @param quoteCharacter the quote character
    /// @return the length of the field data after removing escapes
    private static int cleanDelimiters(final char[] buf, final int begin, final int end,
                                       final char quoteCharacter) {

        int i = begin;

        // fast-forward to first quote
        while (i < end && buf[i] != quoteCharacter) {
            i++;
        }

        // everything before the first quote is already in place
        int newPos = i;
        boolean escape = false;
        for (; i < end; i++) {
            final char c = buf[i];
            if (c == quoteCharacter) {
                // every other quote character is dropped
                escape = !escape;
                if (escape) {
                    // skip quote
                    continue;
                }
            }

            // shift character
            buf[newPos++] = c;
        }

        return newPos - begin;
    }

    /// {@return the line number that was passed to the callback handler for the current record}
    @Override
    public long getStartingLineNumber() {
        return startingLineNumber;
    }

    /// Resets the parser so that parsing can start over with fresh data from the underlying reader.
    ///
    /// Besides the buffer, all state carried over between records is cleared. Without that, a parser
    /// that once hit the end of the input would never deliver data again, a multi-line record read
    /// before the reset would skew the next starting line number, and a stale field status could
    /// change how the first character after the reset is interpreted.
    ///
    /// @param startingLineNumber the new line number base; the next `parse()` call adds one line to it
    @SuppressWarnings("checkstyle:HiddenField")
    @Override
    public void reset(final long startingLineNumber) {
        this.startingLineNumber = startingLineNumber;
        lines = 1;
        status = STATUS_RESET;
        finished = false;
        csvBuffer.reset();
    }

    /// Closes the underlying reader (if any).
    ///
    /// @throws IOException if closing the reader fails
    @Override
    public void close() throws IOException {
        csvBuffer.close();
    }

    /// Returns the upcoming line (without its line terminator) without consuming it.
    ///
    /// @return the buffered data from the buffer's `begin` position up to the next CR / LF or the end of the data
    /// @throws EOFException if no data is available
    /// @throws IOException  if reading from the underlying reader fails
    @Override
    public String peekLine() throws IOException {
        if (csvBuffer.pos == csvBuffer.len && !csvBuffer.fetchData()) {
            throw new EOFException();
        }

        // fetchData() may relocate the buffer content. It keeps the distance between begin and pos
        // intact but not the absolute index - so remember the position relative to begin.
        final int savedOffset = csvBuffer.pos - csvBuffer.begin;

        for (; csvBuffer.pos < csvBuffer.len || csvBuffer.fetchData(); csvBuffer.pos++) {
            final char c = csvBuffer.buf[csvBuffer.pos];
            if (c == CR || c == LF) {
                break;
            }
        }

        final String s = new String(csvBuffer.buf, csvBuffer.begin, csvBuffer.pos - csvBuffer.begin);
        csvBuffer.pos = csvBuffer.begin + savedOffset;
        return s;
    }

    /// Skips the remainder of the current line, including its line terminator (CR, LF or CRLF).
    ///
    /// @param numCharsToSkip the number of characters of the line that were peeked already
    /// @throws EOFException if no data is available and `numCharsToSkip` is 0
    /// @throws IOException  if reading from the underlying reader fails
    @Override
    public void skipLine(final int numCharsToSkip) throws IOException {
        // Skip chars that have been peeked already
        csvBuffer.pos += numCharsToSkip;

        if (csvBuffer.pos >= csvBuffer.len && !csvBuffer.fetchData()) {
            if (numCharsToSkip == 0) {
                throw new EOFException();
            }
            // The peeked characters were the last line (without a line terminator). Fall through so
            // that they are marked as consumed - otherwise the next parse() would emit them as a record.
        } else {
            do {
                final char c = csvBuffer.buf[csvBuffer.pos++];
                if (c == CR) {
                    if ((csvBuffer.pos < csvBuffer.len || csvBuffer.fetchData())
                        && csvBuffer.buf[csvBuffer.pos] == LF) {
                        // CRLF
                        csvBuffer.pos++;
                    }
                    break;
                } else if (c == LF) {
                    break;
                }
            } while (csvBuffer.pos < csvBuffer.len || csvBuffer.fetchData());
        }

        if (csvBuffer.begin < csvBuffer.pos) {
            // mark the skipped line as consumed
            csvBuffer.begin = csvBuffer.pos;
            startingLineNumber++;
        }
    }

    /// Character buffer that is either backed by a `Reader` (refilled on demand) or by a fixed string.
    @SuppressWarnings("checkstyle:visibilitymodifier")
    private static class CsvBuffer implements Closeable {

        private static final int DEFAULT_READ_SIZE = 8192;

        /// The buffered characters.
        char[] buf;

        /// The number of valid characters in `buf`.
        int len;

        /// The start of the data that has not been handed over yet; data before it may be discarded.
        int begin;

        /// The position of the next character to process.
        int pos;

        @Nullable
        private final Reader reader;

        private final int maxBufferSize;
        private final int readSize;

        /// Creates a buffer that is filled from the given reader.
        ///
        /// @param reader        the data source
        /// @param maxBufferSize the maximum size the buffer may grow to (must be > 0)
        CsvBuffer(final Reader reader, final int maxBufferSize) {
            Preconditions.checkArgument(maxBufferSize > 0, "maxBufferSize must be > 0");
            this.reader = reader;
            this.maxBufferSize = maxBufferSize;

            // limit optimal read size to maxBufferSize
            readSize = Math.min(maxBufferSize, DEFAULT_READ_SIZE);

            // Buffer may still contain unprocessed data, so extra space is needed to read readSize chars.
            buf = new char[Math.min(maxBufferSize, readSize * 2)];
        }

        /// Creates a buffer that holds exactly the given data; it is never refilled.
        ///
        /// @param data the complete data
        CsvBuffer(final String data) {
            reader = null;
            maxBufferSize = -1;
            buf = data.toCharArray();
            len = data.length();
            readSize = -1;
        }

        /// Reads data from the underlying reader and manages the local buffer.
        ///
        /// If the free space at the end of the buffer is too small, the unconsumed data (starting at
        /// `begin`) is moved to the front of the buffer or into a larger buffer. `begin`, `pos` and
        /// `len` are shifted by the same amount, so positions relative to `begin` stay valid.
        ///
        /// @return `true`, if data was fetched, `false` if the end of the stream was reached
        /// @throws IOException       if a read error occurs
        /// @throws CsvParseException if more space is needed but the buffer already has its maximum size
        private boolean fetchData() throws IOException {
            if (reader == null) {
                // Fixed string data
                return false;
            }

            if (buf.length - len < readSize) {
                // not enough space in the buffer to read readSize chars

                if (begin == len) {
                    // all data was consumed -- nothing to relocate
                    pos = len = 0;
                } else {
                    if (buf.length - len + begin < readSize) {
                        // reclaimable space is insufficient - allocate a larger buffer
                        final char[] newBuf = largerBuffer();
                        System.arraycopy(buf, begin, newBuf, 0, len - begin);
                        buf = newBuf;
                    } else {
                        // it's enough to relocate data and continue with the same buffer
                        System.arraycopy(buf, begin, buf, 0, len - begin);
                    }

                    pos -= begin;
                    len -= begin;
                }

                begin = 0;
            }

            // A buffer that was enlarged up to maxBufferSize may offer less than readSize free chars;
            // never ask the reader for more than what actually fits.
            final int cnt = reader.read(buf, len, Math.min(readSize, buf.length - len));
            if (cnt == -1) {
                return false;
            }
            len += cnt;
            return true;
        }

        /// Allocates a buffer of twice the current size, limited to `maxBufferSize`.
        ///
        /// @return the new (empty) buffer
        /// @throws CsvParseException if the current buffer already has the maximum size
        private char[] largerBuffer() {
            if (maxBufferSize == buf.length) {
                throw new CsvParseException("""
                    The maximum buffer size of %d is \
                    insufficient to read the data of a single field. \
                    This issue typically arises when a quotation begins but does not conclude within the \
                    confines of this buffer's maximum limit. \
                    """.formatted(maxBufferSize));
            }
            return new char[Math.min(maxBufferSize, buf.length * 2)];
        }

        /// Discards all buffered data.
        private void reset() {
            len = 0;
            begin = 0;
            pos = 0;
        }

        @Override
        public void close() throws IOException {
            if (reader != null) {
                reader.close();
            }
        }

    }

}