RelaxedCsvParser.java

package de.siegmar.fastcsv.reader;

import static de.siegmar.fastcsv.util.Util.CR;
import static de.siegmar.fastcsv.util.Util.LF;

import java.io.Closeable;
import java.io.EOFException;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import de.siegmar.fastcsv.util.Nullable;
import de.siegmar.fastcsv.util.Preconditions;
import de.siegmar.fastcsv.util.Util;

/// Less strict but also less performant CSV parser.
@SuppressWarnings({
    "checkstyle:CyclomaticComplexity",
    "checkstyle:ExecutableStatementCount",
    "checkstyle:InnerAssignment",
    "checkstyle:JavaNCSS",
    "checkstyle:NestedIfDepth"
})
final class RelaxedCsvParser implements CsvParser {

    private static final char SPACE = ' ';
    private static final int EOF = -1;
    private static final int DEFAULT_BUFFER_SIZE = 8192;

    private final char fsep;

    @Nullable
    private final char[] fsepRemainder;

    private final char qChar;
    private final CommentStrategy cStrat;
    private final char cChar;
    private final boolean trimWhitespacesAroundQuotes;
    private final CsvCallbackHandler<?> callbackHandler;
    private final int maxBufferSize;
    private final LookaheadReader reader;
    private long startingLineNumber;
    private char[] currentField;
    private int currentFieldIndex;
    private int lines = 1;

    @SuppressWarnings("checkstyle:ParameterNumber")
    RelaxedCsvParser(final String fsep, final char qChar,
                     final CommentStrategy cStrat, final char cChar,
                     final boolean trimWhitespacesAroundQuotes,
                     final CsvCallbackHandler<?> callbackHandler,
                     final int maxBufferSize,
                     final Reader reader) {
        assertFields(fsep, qChar, cChar);

        this.fsep = fsep.charAt(0);
        fsepRemainder = extractFsepRemainder(fsep);
        this.qChar = qChar;
        this.cStrat = cStrat;
        this.cChar = cChar;
        this.trimWhitespacesAroundQuotes = trimWhitespacesAroundQuotes;
        this.callbackHandler = callbackHandler;
        this.maxBufferSize = maxBufferSize;
        this.reader = new LookaheadReader(reader, DEFAULT_BUFFER_SIZE);
        currentField = new char[Math.min(maxBufferSize, DEFAULT_BUFFER_SIZE)];
    }

    @SuppressWarnings("checkstyle:ParameterNumber")
    RelaxedCsvParser(final String fsep, final char qChar,
                     final CommentStrategy cStrat, final char cChar,
                     final boolean trimWhitespacesAroundQuotes,
                     final CsvCallbackHandler<?> callbackHandler,
                     final int maxBufferSize,
                     final String data) {
        assertFields(fsep, qChar, cChar);

        this.fsep = fsep.charAt(0);
        fsepRemainder = extractFsepRemainder(fsep);
        this.qChar = qChar;
        this.cStrat = cStrat;
        this.cChar = cChar;
        this.trimWhitespacesAroundQuotes = trimWhitespacesAroundQuotes;
        this.callbackHandler = callbackHandler;
        this.maxBufferSize = maxBufferSize;
        reader = new LookaheadReader(new StringReader(data), Math.max(data.length(), 1));
        currentField = new char[Math.min(maxBufferSize, data.length())];
    }

    private void assertFields(final String fieldSeparator, final char quoteCharacter, final char commentCharacter) {
        Preconditions.checkArgument(!Util.containsNewline(fieldSeparator),
            "fieldSeparator must not contain newline chars");
        Preconditions.checkArgument(!Util.isNewline(quoteCharacter), "quoteCharacter must not be a newline char");
        Preconditions.checkArgument(!Util.isNewline(commentCharacter), "commentCharacter must not be a newline char");
        Preconditions.checkArgument(!Util.containsDupe(fieldSeparator.charAt(0), quoteCharacter, commentCharacter),
            "Control characters must differ (fieldSeparator=%s, quoteCharacter=%s, commentCharacter=%s)".formatted(
            fieldSeparator.charAt(0), quoteCharacter, commentCharacter));
    }

    @Nullable
    @SuppressWarnings({"PMD.AvoidLiteralsInIfCondition", "PMD.ReturnEmptyCollectionRatherThanNull"})
    private static char[] extractFsepRemainder(final String fsep) {
        if (fsep.length() <= 1) {
            return null;
        }
        final char[] fsepRemainder = new char[fsep.length() - 1];
        fsep.getChars(1, fsep.length(), fsepRemainder, 0);
        return fsepRemainder;
    }

    @SuppressWarnings({"checkstyle:ReturnCount", "checkstyle:NPathComplexity"})
    @Override
    public boolean parse() throws IOException {
        startingLineNumber += lines;
        lines = 1;
        callbackHandler.beginRecord(startingLineNumber);

        int ch = reader.read();

        if (ch == EOF) {
            return false;
        }
        if (ch == CR) {
            reader.consumeLF();
            callbackHandler.setEmpty();
            return true;
        }
        if (ch == LF) {
            callbackHandler.setEmpty();
            return true;
        }
        if (ch == cChar && cStrat != CommentStrategy.NONE) {
            parseComment();
            return true;
        }

        do {
            if (ch == qChar ? parseQuoted() : parseUnquoted(ch)) {
                return true;
            }
        } while ((ch = reader.read()) != EOF);

        callbackHandler.addField(currentField, 0, currentFieldIndex, false);
        currentFieldIndex = 0;
        return true;
    }

    @SuppressWarnings({
        "checkstyle:ReturnCount",
        "checkstyle:FinalParameters",
        "checkstyle:ParameterAssignment",
        "checkstyle:NPathComplexity",
        "checkstyle:BooleanExpressionComplexity",
        "PMD.AvoidReassigningParameters"
    })
    private boolean parseUnquoted(int ch) throws IOException {
        boolean endOfRecord = true;

        do {
            // fast-forward
            while (currentFieldIndex < currentField.length && reader.len > reader.start
                && ch != CR && ch != LF && ch != fsep && ch != qChar) {
                currentField[currentFieldIndex++] = (char) ch;
                ch = reader.buffer[reader.start++];
            }

            if (ch == fsep && (fsepRemainder == null || reader.consumeIf(fsepRemainder))) {
                endOfRecord = false;
                break;
            }
            if (ch == CR) {
                reader.consumeLF();
                break;
            }
            if (ch == LF) {
                break;
            }
            if (ch == qChar && trimWhitespacesAroundQuotes && currentFieldHasOnlyWhitespace()) {
                currentFieldIndex = 0;
                return parseQuoted();
            }

            appendChar(ch);
        } while ((ch = reader.read()) != EOF);

        callbackHandler.addField(currentField, 0, currentFieldIndex, false);
        currentFieldIndex = 0;
        return endOfRecord;
    }

    private boolean currentFieldHasOnlyWhitespace() {
        for (int i = 0; i < currentFieldIndex; i++) {
            if (currentField[i] > SPACE) {
                return false;
            }
        }
        return true;
    }

    @SuppressWarnings({
        "checkstyle:NPathComplexity",
        "checkstyle:ReturnCount",
        "checkstyle:BooleanExpressionComplexity",
        "PMD.AssignmentInOperand"
    })
    private boolean parseQuoted() throws IOException {
        boolean endOfRecord = true;

        int ch;
        OUTER: while ((ch = reader.read()) != EOF) {
            // fast-forward
            while (currentFieldIndex < currentField.length && reader.len > reader.start
                && ch != CR && ch != LF && ch != qChar) {
                currentField[currentFieldIndex++] = (char) ch;
                ch = reader.buffer[reader.start++];
            }

            if (ch == qChar && (ch = reader.read()) != qChar) {
                // closing quote
                for (; ch != EOF; ch = reader.read()) {
                    if (ch == CR) {
                        // CR right after closing quote
                        reader.consumeLF();
                        break OUTER;
                    }
                    if (ch == LF) {
                        // LF right after closing quote
                        break OUTER;
                    }
                    if (ch == fsep && (fsepRemainder == null || reader.consumeIf(fsepRemainder))) {
                        // field separator after closing quote
                        endOfRecord = false;
                        break OUTER;
                    }
                    if (!trimWhitespacesAroundQuotes || ch > SPACE) {
                        throw new CsvParseException("Unexpected character after closing quote: '%c' (0x%x)"
                            .formatted(ch, ch));
                    }
                }

                break;
            }

            appendChar(ch);
            if (ch == CR) {
                if (reader.consumeLF()) {
                    appendChar(LF);
                }
                lines++;
            } else if (ch == LF) {
                lines++;
            }
        }

        callbackHandler.addField(currentField, 0, currentFieldIndex, true);
        currentFieldIndex = 0;
        return endOfRecord;
    }

    @SuppressWarnings("PMD.AssignmentInOperand")
    private void parseComment() throws IOException {
        int ch;
        while ((ch = reader.read()) != EOF && ch != LF) {
            if (ch == CR) {
                reader.consumeLF();
                break;
            }
            appendChar(ch);
        }

        callbackHandler.setComment(currentField, 0, currentFieldIndex);
        currentFieldIndex = 0;
    }

    private void appendChar(final int ch) {
        if (currentField.length == currentFieldIndex) {
            if (currentField.length == maxBufferSize) {
                throw new CsvParseException("""
                    The maximum buffer size of %d is \
                    insufficient to read the data of a single field. \
                    This issue typically arises when a quotation begins but does not conclude within the \
                    confines of this buffer's maximum limit. \
                    """.formatted(maxBufferSize));
            }
            final char[] newField = new char[Math.min(maxBufferSize, currentField.length * 2)];
            System.arraycopy(currentField, 0, newField, 0, currentField.length);
            currentField = newField;
        }

        currentField[currentFieldIndex++] = (char) ch;
    }

    @Override
    public String peekLine() throws IOException {
        return reader.peekLine();
    }

    @SuppressWarnings("checkstyle:MultipleVariableDeclarations")
    @Override
    public void skipLine(final int numCharsToSkip) throws IOException {
        reader.skip(numCharsToSkip);

        int c = reader.read();
        if (c == EOF) {
            if (numCharsToSkip == 0) {
                throw new EOFException();
            }
            return;
        }

        do {
            if (c == CR) {
                reader.consumeLF();
                startingLineNumber++;
                break;
            }
            if (c == LF) {
                startingLineNumber++;
                break;
            }
        } while ((c = reader.read()) != EOF);
    }

    @Override
    public long getStartingLineNumber() {
        return startingLineNumber;
    }

    @SuppressWarnings("checkstyle:HiddenField")
    @Override
    public void reset(final long startingLineNumber) {
        // The IndexedCsvReader currently does not support relaxed parsing.
        throw new UnsupportedOperationException();
    }

    @Override
    public void close() throws IOException {
        reader.close();
    }

    private static final class LookaheadReader implements Closeable {

        private final Reader reader;
        private final char[] buffer;
        private int start;
        private int len;

        LookaheadReader(final Reader reader, final int bufferSize) {
            this.reader = reader;
            buffer = new char[bufferSize];
        }

        int read() throws IOException {
            ensureBuffered(1);
            return start >= len ? -1 : buffer[start++];
        }

        boolean consumeLF() throws IOException {
            ensureBuffered(1);
            if (start >= len || buffer[start] != LF) {
                return false;
            }
            start++;
            return true;
        }

        @SuppressWarnings("PMD.UseVarargs")
        boolean consumeIf(final char[] chars) throws IOException {
            ensureBuffered(chars.length);
            if (len - start < chars.length) {
                return false;
            }
            for (int i = 0; i < chars.length; i++) {
                if (buffer[start + i] != chars[i]) {
                    return false;
                }
            }
            start += chars.length;
            return true;
        }

        String peekLine() throws IOException {
            ensureBuffered(buffer.length);
            if (start >= len) {
                throw new EOFException();
            }
            int endIndex = start;
            while (endIndex < len && buffer[endIndex] != CR && buffer[endIndex] != LF) {
                endIndex++;
            }
            return new String(buffer, start, endIndex - start);
        }

        private void ensureBuffered(final int required) throws IOException {
            final int available = len - start;
            if (len == -1 || required <= available) {
                return;
            }

            // relocate the buffer if necessary
            if (start > 0 && required > buffer.length - start) {
                final int remaining = len - start;
                System.arraycopy(buffer, start, buffer, 0, remaining);
                start = 0;
                len = remaining;
            }

            // fetch more data
            while (len - start < required) {
                final int count = reader.read(buffer, len, buffer.length - len);
                if (count == -1) {
                    len = (start >= len) ? -1 : len;
                    break;
                }
                len += count;
            }
        }

        void skip(final int numCharsToSkip) {
            start += numCharsToSkip;
        }

        @Override
        public void close() throws IOException {
            reader.close();
        }

    }

}