// CSVReaderUTF8.java
package com.alibaba.fastjson2.support.csv;
import com.alibaba.fastjson2.JSONException;
import com.alibaba.fastjson2.JSONFactory;
import com.alibaba.fastjson2.reader.*;
import com.alibaba.fastjson2.stream.StreamReader;
import com.alibaba.fastjson2.util.DateUtils;
import com.alibaba.fastjson2.util.Fnv;
import com.alibaba.fastjson2.util.IOUtils;
import com.alibaba.fastjson2.util.TypeUtils;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Type;
import java.math.BigDecimal;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Consumer;
import java.util.function.Function;
import static com.alibaba.fastjson2.util.DateUtils.DEFAULT_ZONE_ID;
final class CSVReaderUTF8<T>
extends CSVReader<T> {
// Sentinel for nextEscapeIndex: the position of the next double quote has not been looked up yet.
static final int ESCAPE_INDEX_NOT_SET = -2;
// Cached index of the next '"' in buf. Starts as ESCAPE_INDEX_NOT_SET; the scanning code treats -1 as
// "no quote found" (presumably the IOUtils.indexOfDoubleQuote miss value — confirm against IOUtils).
int nextEscapeIndex = ESCAPE_INDEX_NOT_SET;
// Cache of value-consumer factories, keyed in readLineObjectAll by a 64-bit hash of the
// target class name followed by its field names.
static final Map<Long, Function<Consumer, ByteArrayValueConsumer>> valueConsumerCreators
= new ConcurrentHashMap<>();
// Bytes being parsed: supplied by the byte[] constructors, or allocated lazily by initBuf() for stream input.
byte[] buf;
// Source stream; null when parsing an in-memory byte array.
InputStream input;
// Charset used to decode column bytes into strings; UTF-8 unless a constructor overrides it.
Charset charset = StandardCharsets.UTF_8;
// Consumer used by readAll() / readAll(int); null unless one was passed to a constructor.
ByteArrayValueConsumer valueConsumer;
/**
 * Creates a reader with no input attached and the given features enabled.
 *
 * @param features features whose masks are OR-ed into the reader's feature bits
 */
CSVReaderUTF8(Feature... features) {
    for (int i = 0; i < features.length; i++) {
        this.features |= features[i].mask;
    }
}

/**
 * Creates a reader over a slice of {@code bytes}, mapping rows to {@code objectClass}.
 *
 * @param bytes       source bytes
 * @param off         index of the first byte to parse
 * @param len         number of bytes to parse
 * @param charset     charset used to decode column text
 * @param objectClass target row type, passed to the superclass
 */
CSVReaderUTF8(byte[] bytes, int off, int len, Charset charset, Class<T> objectClass) {
    super(objectClass);
    this.charset = charset;
    this.buf = bytes;
    this.end = off + len;
    this.off = off;
}

/**
 * Creates a reader over a slice of {@code bytes} that reports columns to {@code valueConsumer}.
 *
 * @param bytes         source bytes
 * @param off           index of the first byte to parse
 * @param len           number of bytes to parse
 * @param charset       charset used to decode column text
 * @param valueConsumer consumer used by the {@code readAll()} overloads
 */
CSVReaderUTF8(byte[] bytes, int off, int len, Charset charset, ByteArrayValueConsumer valueConsumer) {
    this.charset = charset;
    this.buf = bytes;
    this.end = off + len;
    this.off = off;
    this.valueConsumer = valueConsumer;
}

/**
 * Creates a reader over a slice of {@code bytes} with explicit per-column types.
 *
 * @param bytes source bytes
 * @param off   index of the first byte to parse
 * @param len   number of bytes to parse
 * @param types column types, indexed by column position
 */
CSVReaderUTF8(byte[] bytes, int off, int len, Type[] types) {
    super(types);
    this.types = types;
    this.buf = bytes;
    this.end = off + len;
    this.off = off;
}

/**
 * Creates a reader over a slice of {@code bytes}, mapping rows to {@code objectClass}
 * and keeping the default UTF-8 charset.
 *
 * @param bytes       source bytes
 * @param off         index of the first byte to parse
 * @param len         number of bytes to parse
 * @param objectClass target row type, passed to the superclass
 */
CSVReaderUTF8(byte[] bytes, int off, int len, Class<T> objectClass) {
    super(objectClass);
    this.buf = bytes;
    this.end = off + len;
    this.off = off;
}

/**
 * Creates a stream-backed reader with explicit per-column types.
 *
 * @param input   stream to read from; the buffer is allocated on first use
 * @param charset charset used to decode column text
 * @param types   column types, passed to the superclass
 */
CSVReaderUTF8(InputStream input, Charset charset, Type[] types) {
    super(types);
    this.input = input;
    this.charset = charset;
}

/**
 * Creates a stream-backed reader mapping rows to {@code objectClass}.
 *
 * @param input       stream to read from; the buffer is allocated on first use
 * @param charset     charset used to decode column text
 * @param objectClass target row type, passed to the superclass
 */
CSVReaderUTF8(InputStream input, Charset charset, Class<T> objectClass) {
    super(objectClass);
    this.input = input;
    this.charset = charset;
}

/**
 * Creates a stream-backed reader that reports columns to {@code valueConsumer}.
 *
 * @param input         stream to read from; the buffer is allocated on first use
 * @param charset       charset used to decode column text
 * @param valueConsumer consumer used by the {@code readAll()} overloads
 */
CSVReaderUTF8(InputStream input, Charset charset, ByteArrayValueConsumer valueConsumer) {
    this.valueConsumer = valueConsumer;
    this.input = input;
    this.charset = charset;
}
/**
 * Tests whether any of the eight bytes packed into {@code v} is a double quote
 * ({@code 0x22}), a line feed ({@code 0x0A}) or a carriage return ({@code 0x0D}).
 *
 * <p>Equivalent to checking each byte of {@code v} in turn, but done with word-wide
 * arithmetic: XOR-ing with a repeated target byte turns matching bytes into zero, and
 * {@code (x - 0x01..01) & ~x & 0x80..80} is non-zero exactly when {@code x} has a zero byte.
 *
 * @param v eight bytes packed into a long
 * @return {@code true} if at least one byte equals {@code '"'}, {@code '\n'} or {@code '\r'}
 */
static boolean containsQuoteOrLineSeparator(long v) {
    final long lows = 0x0101010101010101L;
    final long highs = 0x8080808080808080L;

    long quote = v ^ 0x2222222222222222L;
    long lineFeed = v ^ 0x0A0A0A0A0A0A0A0AL;
    long carriageReturn = v ^ 0x0D0D0D0D0D0D0D0DL;

    long hits = ((quote - lows) & ~quote)
            | ((lineFeed - lows) & ~lineFeed)
            | ((carriageReturn - lows) & ~carriageReturn);
    return (hits & highs) != 0;
}
/**
 * Returns the parse buffer, allocating and filling it from the stream on first use.
 *
 * @return the current buffer, never {@code null}
 * @throws IOException if the initial read from the stream fails
 */
private byte[] getBuf() throws IOException {
    final byte[] current = this.buf;
    return current == null ? initBuf() : current;
}
/**
 * Allocates the parse buffer and fills it from {@link #input} until the buffer is full
 * or the stream is exhausted, then skips a leading UTF-8 byte order mark if present.
 *
 * @return the newly allocated buffer, also stored in {@link #buf}
 * @throws JSONException if no input stream is attached
 * @throws IOException   if reading from the stream fails
 */
private byte[] initBuf() throws IOException {
    if (input == null) {
        throw new JSONException("init buf error");
    }

    byte[] buf = new byte[SIZE_512K];
    int end = 0;
    while (end < buf.length) {
        // Append after the bytes already read. Reading at the (unchanging) field `off`
        // would overwrite earlier chunks and let `end` grow past buf.length.
        int cnt = input.read(buf, end, buf.length - end);
        if (cnt == -1) {
            inputEnd = true;
            break;
        }
        end += cnt;
    }
    this.end = end;

    // Start parsing after the BOM so it is not treated as part of the first column.
    if (end > 4 && IOUtils.isUTF8BOM(buf, 0)) {
        off = 3;
        lineNextStart = 3;
    }

    this.buf = buf;
    return buf;
}
/**
 * Locates the next line in the buffer and records it in {@code lineStart}/{@code lineEnd}.
 *
 * <p>Fast path: when the next line separator comes before the next double quote (or there
 * is no quote), the line cannot contain quoted separators and its bounds are taken directly
 * from the separator index. Otherwise the byte-by-byte scanner {@code seekLine0} is used.
 * For stream input, the unread tail is moved to the front of the buffer and the buffer is
 * refilled when no separator is found in the buffered bytes.
 *
 * @return {@code true} if a line was found, {@code false} if the input is exhausted
 * @throws IOException if reading from the stream fails
 */
protected boolean seekLine() throws IOException {
    byte[] buf = getBuf();
    int off = this.off, end = this.end;

    // Refresh the cached quote position if it was never computed or now lies behind the cursor.
    int escapeIndex = this.nextEscapeIndex;
    if (escapeIndex == ESCAPE_INDEX_NOT_SET || (escapeIndex != -1 && escapeIndex < off)) {
        nextEscapeIndex = escapeIndex = IOUtils.indexOfDoubleQuote(buf, off, end);
    }

    int lineSeparatorIndex = IOUtils.indexOfLineSeparator(buf, off, end);
    if (lineSeparatorIndex == -1) {
        if (input != null) {
            if (!inputEnd) {
                // Compact: move the unread tail to index 0, then refill the rest of the buffer.
                int rest = end - off;
                if (rest > 0) {
                    System.arraycopy(buf, off, buf, 0, rest);
                }
                end = rest;
                while (end < buf.length) {
                    int cnt = input.read(buf, end, buf.length - end);
                    if (cnt != -1) {
                        end += cnt;
                    } else {
                        inputEnd = true;
                        break;
                    }
                }

                // Rebase the quote index to the compacted buffer, or search the newly read bytes.
                // The local copy must be updated too: the fast-path test below compares it with
                // lineSeparatorIndex, which is expressed in post-compaction coordinates.
                escapeIndex = escapeIndex >= off
                        ? escapeIndex - off
                        : IOUtils.indexOfDoubleQuote(buf, rest, end);
                this.nextEscapeIndex = escapeIndex;
                lineSeparatorIndex = IOUtils.indexOfLineSeparator(buf, rest, end);
                this.off = off = 0;
                this.end = end;
            }
        }

        if (lineSeparatorIndex == -1 && inputEnd) {
            if (off < end) {
                // Last line without a trailing separator: it ends at the end of the data.
                lineSeparatorIndex = end;
            } else {
                return false;
            }
        }
    }

    if ((lineSeparatorIndex != -1) && (escapeIndex == -1 || escapeIndex > lineSeparatorIndex)) {
        this.lineTerminated = true;
        this.lineStart = off;
        // Exclude a '\r' immediately before the separator from the line content.
        this.lineEnd = lineSeparatorIndex != off && buf[lineSeparatorIndex - 1] == '\r'
                ? lineSeparatorIndex - 1
                : lineSeparatorIndex;
        this.off = lineSeparatorIndex + 1;
        return true;
    }

    return seekLine0(buf, off, end);
}
/**
 * Byte-by-byte line scanner used when the fast path in seekLine() does not apply.
 * Tracks the quoted state in the {@code quote} field so that '\r' / '\n' inside a quoted
 * section do not terminate the line, and reads more bytes from the stream when the line is
 * not terminated within the buffered data.
 *
 * @param buf buffer to scan
 * @param off index at which scanning starts
 * @param end exclusive end of the valid data in {@code buf}
 * @return {@code false} only when the stream reports end-of-input and no bytes remain;
 *         {@code true} otherwise
 * @throws IOException if reading from the stream fails
 */
private boolean seekLine0(byte[] buf, int off, int end) throws IOException {
int lineNextStart = off;
// At most three passes; a pass is repeated (via continue) only after more bytes were read.
for (int k = 0; k < 3; ++k) {
lineTerminated = false;
for (int i = off; i < end; i++) {
byte ch = buf[i];
if (ch == '"') {
lineSize++;
if (!quote) {
quote = true;
} else {
int n = i + 1;
// The byte after this quote is not buffered, so it cannot be classified yet; stop scanning.
if (n >= end) {
break;
}
if (buf[n] == '"') {
// Doubled quote inside a quoted section: consume both bytes and stay quoted.
lineSize++;
i++;
} else {
quote = false;
}
}
continue;
}
if (quote) {
// Inside quotes every byte, including line separators, belongs to the line.
lineSize++;
continue;
}
if (ch == '\r' || ch == '\n') {
// Empty lines are counted as rows unless IgnoreEmptyLine is enabled.
if (lineSize > 0 || (features & Feature.IgnoreEmptyLine.mask) == 0) {
rowCount++;
}
lineTerminated = true;
lineSize = 0;
lineEnd = i;
if (ch == '\r') {
int n = i + 1;
if (n >= end) {
break;
}
// Treat "\r\n" as a single separator.
if (buf[n] == '\n') {
i++;
}
}
lineStart = lineNextStart;
lineNextStart = off = i + 1;
break;
} else {
lineSize++;
}
}
if (!lineTerminated) {
if (input != null && !inputEnd) {
int len = end - off;
int escapedIndex = this.nextEscapeIndex;
if (off > 0) {
if (len > 0) {
// Move the unterminated line to the front of the buffer and rebase the cached quote index.
System.arraycopy(buf, off, buf, 0, len);
this.nextEscapeIndex = escapedIndex >= off ? escapedIndex - off : ESCAPE_INDEX_NOT_SET;
}
lineStart = lineNextStart = 0;
off = 0;
end = len;
// The next pass rescans the moved line from index 0, so the quoted state starts over.
// NOTE(review): lineSize is not reset before the rescan, and quote is left untouched
// when off == 0 — confirm this is intended.
quote = false;
}
int cnt = input.read(buf, end, buf.length - end);
if (cnt == -1) {
inputEnd = true;
if (off == end) {
this.off = off;
return false;
}
} else {
end += cnt;
this.end = end;
continue;
}
}
// No more data can arrive: everything up to end is the final line.
lineStart = lineNextStart;
lineEnd = end;
rowCount++;
lineSize = 0;
off = end;
}
lineTerminated = off == end;
break;
}
this.off = off;
return true;
}
/**
 * Converts the raw bytes of one column into a value of the requested type.
 *
 * @param bytes buffer holding the column bytes
 * @param off   index of the first column byte
 * @param len   number of column bytes
 * @param type  requested type
 * @return {@code null} for an empty column; otherwise the parsed value. Types without a
 *         dedicated parser are decoded to a string and converted with {@code TypeUtils.cast}
 */
Object readValue(byte[] bytes, int off, int len, Type type) {
    if (len == 0) {
        return null;
    }

    final Object parsed;
    if (type == Integer.class) {
        parsed = TypeUtils.parseInt(bytes, off, len);
    } else if (type == Long.class) {
        parsed = TypeUtils.parseLong(bytes, off, len);
    } else if (type == BigDecimal.class) {
        parsed = TypeUtils.parseBigDecimal(bytes, off, len);
    } else if (type == Float.class) {
        parsed = TypeUtils.parseFloat(bytes, off, len);
    } else if (type == Double.class) {
        parsed = TypeUtils.parseDouble(bytes, off, len);
    } else if (type == Date.class) {
        parsed = new Date(DateUtils.parseMillis(bytes, off, len, charset, DEFAULT_ZONE_ID));
    } else if (type == Boolean.class) {
        parsed = TypeUtils.parseBoolean(bytes, off, len);
    } else {
        String text = new String(bytes, off, len, charset);
        parsed = TypeUtils.cast(text, type);
    }
    return parsed;
}
/**
 * Returns the {@code inputEnd} flag.
 *
 * @return {@code true} once the reader has recorded that the input is exhausted
 */
public boolean isEnd() {
return inputEnd;
}
/**
 * Reads the next line and returns its column values.
 *
 * @param strings when {@code true}, every column is returned as a {@code String}
 * @return the column values, or {@code null} when no further line is available
 * @throws JSONException if reading from the underlying stream fails
 */
public Object[] readLineValues(boolean strings) {
    try {
        boolean streamDrained = inputEnd && off == end;
        boolean arrayDrained = input == null && off >= end;
        if (streamDrained || arrayDrained) {
            return null;
        }
        if (!seekLine()) {
            return null;
        }
    } catch (IOException e) {
        throw new JSONException("seekLine error", e);
    }

    // A quote inside the current line means the quote-aware parser is needed.
    int quoteIndex = this.nextEscapeIndex;
    if (quoteIndex != -1 && quoteIndex < lineEnd) {
        return readLineValueEscaped(strings);
    }
    return readLineValue(strings);
}
/**
 * Parses the current line, which contains no double quotes, by splitting on commas.
 * A trailing empty column after the last comma is not emitted.
 *
 * @param strings when {@code true}, every column is returned as a {@code String}
 * @return the column values; sized by the header columns when a header is known,
 *         otherwise by the number of columns found
 */
private Object[] readLineValue(boolean strings) {
    final byte[] bytes = this.buf;
    final int limit = this.lineEnd;
    final List<String> header = this.columns;

    Object[] values = null;
    List<Object> collected = null;
    if (header == null) {
        collected = new ArrayList<>();
    } else {
        int count = header.size();
        values = strings ? new String[count] : new Object[count];
    }

    int column = 0;
    int start = lineStart;
    for (int i = lineStart; i < limit; ++i) {
        if (bytes[i] != ',') {
            continue;
        }
        readValue(strings, column, i - start, bytes, start, values, collected);
        column++;
        start = i + 1;
    }

    int tail = limit - start;
    if (tail > 0) {
        readValue(strings, column, tail, bytes, start, values, collected);
    }

    if (values == null && collected != null) {
        int count = collected.size();
        values = strings ? new String[count] : new Object[count];
        collected.toArray(values);
    }

    // In-memory input has no stream to signal the end; flag it once the data is consumed.
    if (input == null && off == end) {
        inputEnd = true;
    }
    return values;
}
/**
 * Decodes one unquoted column and stores it in {@code values} (by index) or appends it
 * to {@code valueList} when {@code values} is {@code null}.
 *
 * @param strings     when {@code true}, the column is always decoded as a string
 * @param columnIndex zero-based column position, also used to look up the column type
 * @param valueSize   number of bytes in the column
 * @param buf         buffer holding the column bytes
 * @param valueStart  index of the first column byte
 * @param values      fixed-size result array, or {@code null}
 * @param valueList   growing result list, used when {@code values} is {@code null}
 */
private void readValue(boolean strings,
        int columnIndex,
        int valueSize,
        byte[] buf,
        int valueStart,
        Object[] values,
        List<Object> valueList) {
    Type type = null;
    if (types != null && columnIndex < types.length) {
        type = types[columnIndex];
    }
    boolean asText = type == null || type == String.class || type == Object.class || strings;

    Object value;
    if (!asText) {
        try {
            value = readValue(buf, valueStart, valueSize, type);
        } catch (Exception e) {
            value = error(columnIndex, e);
        }
    } else if (valueSize == 1 && buf[valueStart] >= 0) {
        // One- and two-byte ASCII columns go through the TypeUtils.toString overloads.
        value = TypeUtils.toString((char) buf[valueStart]);
    } else if (valueSize == 2 && buf[valueStart] >= 0 && buf[valueStart + 1] >= 0) {
        value = TypeUtils.toString((char) buf[valueStart], (char) buf[valueStart + 1]);
    } else {
        value = new String(buf, valueStart, valueSize, charset);
    }

    if (values == null) {
        valueList.add(value);
    } else if (columnIndex < values.length) {
        // Columns beyond the header width are dropped.
        values[columnIndex] = value;
    }
}
/**
 * Parses the current line when it contains at least one double quote, handling quoted
 * columns and doubled-quote escapes ({@code ""} inside a quoted column stands for {@code "}).
 *
 * @param strings when {@code true}, every column is returned as a {@code String}
 * @return the column values; sized by the header columns when a header is known,
 *         otherwise by the number of columns found ({@code null} if none were found)
 */
private Object[] readLineValueEscaped(boolean strings) {
    List<String> columns = this.columns;
    Object[] values = null;
    List<Object> valueList = null;
    if (columns != null) {
        int size = columns.size();
        values = strings ? new String[size] : new Object[size];
    }

    boolean quote = false;
    int valueStart = lineStart, lineEnd = this.lineEnd;
    int valueSize = 0;   // raw bytes of the column, not counting the enclosing quotes
    int escapeCount = 0; // number of doubled quotes seen in the column
    int columnIndex = 0;
    byte[] buf = this.buf;

    for (int i = lineStart; i < lineEnd; ++i) {
        byte ch = buf[i];
        if (quote) {
            if (ch == '"') {
                int n = i + 1;
                if (n < lineEnd) {
                    byte c1 = buf[n];
                    if (c1 == '"') {
                        valueSize += 2;
                        escapeCount++;
                        ++i;
                        continue;
                    } else if (c1 == ',') {
                        // Closing quote followed by the separator: finish the column below.
                        ++i;
                        ch = c1;
                    }
                } else if (n == lineEnd) {
                    // Closing quote is the last byte of the line.
                    break;
                }
            } else {
                valueSize++;
                continue;
            }
        } else if (ch == '"') {
            quote = true;
            continue;
        }

        if (ch == ',') {
            Object value = decodeEscapedValue(strings, columnIndex, quote, escapeCount, buf, valueStart, valueSize);
            if (values != null) {
                if (columnIndex < values.length) {
                    values[columnIndex] = value;
                }
            } else {
                if (valueList == null) {
                    valueList = new ArrayList<>();
                }
                valueList.add(value);
            }

            quote = false;
            valueStart = i + 1;
            valueSize = 0;
            escapeCount = 0;
            columnIndex++;
            continue;
        }

        valueSize++;
    }

    if (valueSize > 0 || quote) {
        Object value = decodeEscapedValue(strings, columnIndex, quote, escapeCount, buf, valueStart, valueSize);
        if (values != null) {
            if (columnIndex < values.length) {
                values[columnIndex] = value;
            }
        } else {
            if (valueList == null) {
                valueList = new ArrayList<>();
            }
            valueList.add(value);
        }
    }

    if (values == null) {
        if (valueList != null) {
            int size = valueList.size();
            valueList.toArray(
                    values = strings ? new String[size] : new Object[size]
            );
        }
    }

    // In-memory input has no stream to signal the end; flag it once the data is consumed.
    if (input == null && off == end) {
        inputEnd = true;
    }
    return values;
}

/**
 * Decodes one column found by {@link #readLineValueEscaped(boolean)}.
 *
 * @param strings     when {@code true}, the column is always decoded as a string, so the
 *                    result can be stored in a {@code String[]}
 * @param columnIndex zero-based column position, also used to look up the column type
 * @param quoted      whether the column was enclosed in quotes
 * @param escapeCount number of doubled quotes inside the column
 * @param buf         buffer holding the line
 * @param valueStart  index of the first byte of the column (the opening quote when quoted)
 * @param valueSize   raw content length, excluding the enclosing quotes
 * @return the decoded value, or the result of {@code error(columnIndex, e)} if conversion fails
 */
private Object decodeEscapedValue(boolean strings,
        int columnIndex,
        boolean quoted,
        int escapeCount,
        byte[] buf,
        int valueStart,
        int valueSize) {
    Type type = types != null && columnIndex < types.length ? types[columnIndex] : null;
    boolean asString = type == null || type == String.class || type == Object.class || strings;

    byte[] bytes = buf;
    int start = valueStart;
    int len = valueSize;
    if (quoted) {
        // Content starts after the opening quote.
        start = valueStart + 1;
        if (escapeCount != 0) {
            bytes = unescapeQuotedValue(buf, start, valueSize, escapeCount);
            start = 0;
            len = bytes.length;
        }
        if (asString) {
            return new String(bytes, start, len, charset);
        }
    } else if (asString) {
        byte c0, c1;
        if (len == 1 && (c0 = buf[start]) >= 0) {
            return TypeUtils.toString((char) c0);
        }
        if (len == 2
                && (c0 = buf[start]) >= 0
                && (c1 = buf[start + 1]) >= 0
        ) {
            return TypeUtils.toString((char) c0, (char) c1);
        }
        return new String(buf, start, len, charset);
    }

    try {
        return readValue(bytes, start, len, type);
    } catch (Exception e) {
        return error(columnIndex, e);
    }
}

/**
 * Copies the raw content of a quoted column, collapsing each doubled quote into one.
 *
 * @param buf         buffer holding the line
 * @param from        index of the first content byte (just after the opening quote)
 * @param rawSize     raw content length in bytes, doubled quotes counted as two
 * @param escapeCount number of doubled quotes in the content
 * @return a new array of {@code rawSize - escapeCount} bytes
 */
private static byte[] unescapeQuotedValue(byte[] buf, int from, int rawSize, int escapeCount) {
    byte[] bytes = new byte[rawSize - escapeCount];
    // Stop exactly at the end of the content: the closing quote is not part of it.
    int rawEnd = from + rawSize;
    for (int j = from, k = 0; j < rawEnd; ++j) {
        byte c = buf[j];
        bytes[k++] = c;
        if (c == '"' && j + 1 < rawEnd && buf[j + 1] == '"') {
            ++j;
        }
    }
    return bytes;
}
/**
 * Closes the underlying stream, if this reader has one.
 */
@Override
public void close() {
    if (input == null) {
        return;
    }
    IOUtils.close(input);
}
/**
 * Feeds every column of every remaining row into its column statistics,
 * with the row limit set to {@code Integer.MAX_VALUE}.
 */
public void statAll() {
    statAll(Integer.MAX_VALUE);
}
/**
 * Feeds every column of up to {@code maxRows} rows into its column statistics.
 *
 * @param maxRows maximum number of rows to read; a negative value removes the limit
 */
public void statAll(int maxRows) {
    ByteArrayValueConsumer statConsumer = (row, column, bytes, off, len, charset) ->
            getColumnStat(column).stat(bytes, off, len, charset);
    readAll(statConsumer, maxRows);
}
/**
 * Reads all remaining rows into the consumer supplied at construction time.
 *
 * @throws JSONException if this reader was created without a value consumer
 */
public void readAll() {
    readAll(Integer.MAX_VALUE);
}
/**
 * Reads up to {@code maxRows} rows into the consumer supplied at construction time.
 *
 * @param maxRows maximum number of rows to read; a negative value removes the limit
 * @throws JSONException if this reader was created without a value consumer
 */
public void readAll(int maxRows) {
    final ByteArrayValueConsumer target = this.valueConsumer;
    if (target == null) {
        throw new JSONException("unsupported operation, consumer is null");
    }
    readAll(target, maxRows);
}
/**
 * Reads all remaining rows and passes each one to {@code consumer}.
 *
 * <p>When no field readers are available, each row is delivered as the {@code Object[]}
 * returned by {@link #readLineValues(boolean)}. Otherwise rows are delivered through a
 * {@link ByteArrayValueConsumer}: one created by the provider (cached per class and field
 * names) when available, or {@link ByteArrayConsumerImpl} as the fallback.
 *
 * @param readHeader whether to call {@code readHeader()} before reading rows
 * @param consumer   receives one item per row
 */
public void readLineObjectAll(boolean readHeader, Consumer<T> consumer) {
    if (readHeader) {
        readHeader();
    }

    if (fieldReaders == null) {
        Object[] line;
        while ((line = readLineValues(false)) != null) {
            consumer.accept((T) line);
        }
        return;
    }

    // fieldReaders is non-null from here on, so no lazy initialisation is needed.
    ObjectReaderProvider provider = JSONFactory.getDefaultObjectReaderProvider();

    // Cache key: class name followed by the field names, in reader order.
    String[] strings = new String[this.fieldReaders.length + 1];
    strings[0] = objectClass.getName();
    for (int i = 0; i < this.fieldReaders.length; i++) {
        strings[i + 1] = this.fieldReaders[i].fieldName;
    }
    long fullNameHash = Fnv.hashCode64(strings);

    Function<Consumer, ByteArrayValueConsumer> valueConsumerCreator = valueConsumerCreators.get(fullNameHash);
    if (valueConsumerCreator == null) {
        valueConsumerCreator = provider
                .createValueConsumerCreator(objectClass, fieldReaders);
        if (valueConsumerCreator != null) {
            valueConsumerCreators.putIfAbsent(fullNameHash, valueConsumerCreator);
        }
    }

    ByteArrayValueConsumer bytesConsumer = null;
    if (valueConsumerCreator != null) {
        bytesConsumer = valueConsumerCreator.apply(consumer);
    }
    if (bytesConsumer == null) {
        bytesConsumer = new ByteArrayConsumerImpl(consumer);
    }

    readAll(bytesConsumer, Integer.MAX_VALUE);
}
/**
 * Fallback row consumer: builds one object per row with the enclosing reader's
 * {@code objectCreator}, fills it through {@code fieldReaders}, and passes it to the
 * wrapped consumer after the row is complete.
 */
class ByteArrayConsumerImpl
        implements ByteArrayValueConsumer {
    /** Object being filled for the current row; cleared after each row. */
    protected Object object;
    final Consumer consumer;

    public ByteArrayConsumerImpl(Consumer consumer) {
        this.consumer = consumer;
    }

    /** Creates the row object when an object creator is available. */
    @Override
    public final void beforeRow(int row) {
        if (objectCreator == null) {
            return;
        }
        object = objectCreator.get();
    }

    /** Parses one column and assigns it to the matching field; empty and surplus columns are skipped. */
    @Override
    public void accept(int row, int column, byte[] bytes, int off, int len, Charset charset) {
        if (column < fieldReaders.length && len != 0) {
            FieldReader reader = fieldReaders[column];
            reader.accept(object, readValue(bytes, off, len, reader.fieldType));
        }
    }

    /** Delivers the completed row object and drops the reference to it. */
    @Override
    public final void afterRow(int row) {
        Object completed = object;
        consumer.accept(completed);
        object = null;
    }
}
/**
 * Reads rows and reports each column to {@code consumer}, bracketed by
 * {@code start()}/{@code end()} for the whole run and {@code beforeRow}/{@code afterRow}
 * for each row.
 *
 * @param consumer receives the columns of every row
 * @param maxRows  maximum number of rows to read; a negative value removes the limit
 * @throws JSONException if reading from the underlying stream fails
 */
private void readAll(ByteArrayValueConsumer consumer, int maxRows) {
    consumer.start();

    for (int r = 0; r < maxRows || maxRows < 0; ++r) {
        try {
            // Short-circuit &&, matching the same check in readLineValues.
            if (inputEnd && off == end) {
                break;
            }
            if (input == null && off >= end) {
                break;
            }
            if (!seekLine()) {
                break;
            }
        } catch (IOException e) {
            throw new JSONException("seekLine error", e);
        }

        consumer.beforeRow(rowCount);
        // A quote inside the current line means the quote-aware parser is needed.
        int escapedIndex = this.nextEscapeIndex;
        if (escapedIndex == -1 || escapedIndex >= lineEnd) {
            readLine(consumer);
        } else {
            readLineEscaped(consumer);
        }
        consumer.afterRow(rowCount);
    }

    consumer.end();
}
/**
 * Reports the columns of the current line, which contains no double quotes, by splitting
 * on commas. A trailing empty column after the last comma is not reported.
 *
 * @param consumer receives each column as a slice of the parse buffer
 */
private void readLine(ByteArrayValueConsumer consumer) {
    final byte[] bytes = this.buf;
    int column = 0;
    int start = lineStart;
    for (int i = lineStart; i < lineEnd; ++i) {
        if (bytes[i] != ',') {
            continue;
        }
        consumer.accept(rowCount, column, bytes, start, i - start, charset);
        column++;
        start = i + 1;
    }

    int tail = lineEnd - start;
    if (tail > 0) {
        consumer.accept(rowCount, column, bytes, start, tail, charset);
    }
}
/**
 * Reports the columns of the current line when it contains at least one double quote,
 * handling quoted columns and doubled-quote escapes ({@code ""} stands for {@code "}).
 *
 * @param consumer receives each column; quoted columns without escapes are passed as a slice
 *                 of the parse buffer, columns with escapes as a freshly unescaped array
 */
private void readLineEscaped(ByteArrayValueConsumer consumer) {
    boolean quote = false;
    int valueStart = lineStart;
    int valueSize = 0;   // raw bytes of the column, not counting the enclosing quotes
    int escapeCount = 0; // number of doubled quotes seen in the column
    int columnIndex = 0;

    for (int i = lineStart; i < lineEnd; ++i) {
        byte ch = buf[i];
        if (quote) {
            if (ch == '"') {
                int n = i + 1;
                if (n < lineEnd) {
                    byte c1 = buf[n];
                    if (c1 == '"') {
                        valueSize += 2;
                        escapeCount++;
                        ++i;
                        continue;
                    } else if (c1 == ',') {
                        // Closing quote followed by the separator: finish the column below.
                        ++i;
                        ch = c1;
                    }
                } else if (n == lineEnd) {
                    // Closing quote is the last byte of the line.
                    break;
                }
            } else {
                valueSize++;
                continue;
            }
        } else if (ch == '"') {
            quote = true;
            continue;
        }

        if (ch == ',') {
            acceptEscapedColumn(consumer, columnIndex, quote, escapeCount, valueStart, valueSize);
            quote = false;
            valueStart = i + 1;
            valueSize = 0;
            escapeCount = 0;
            columnIndex++;
            continue;
        }

        valueSize++;
    }

    if (valueSize > 0) {
        acceptEscapedColumn(consumer, columnIndex, quote, escapeCount, valueStart, valueSize);
    }
}

/**
 * Passes one column found by {@link #readLineEscaped(ByteArrayValueConsumer)} to the consumer.
 *
 * @param consumer    receiver of the column bytes
 * @param columnIndex zero-based column position
 * @param quoted      whether the column was enclosed in quotes
 * @param escapeCount number of doubled quotes inside the column
 * @param valueStart  index of the first byte of the column (the opening quote when quoted)
 * @param valueSize   raw content length, excluding the enclosing quotes
 */
private void acceptEscapedColumn(ByteArrayValueConsumer consumer,
        int columnIndex,
        boolean quoted,
        int escapeCount,
        int valueStart,
        int valueSize) {
    byte[] columnBuf = buf;
    int columnStart = valueStart;
    int columnSize = valueSize;
    if (quoted) {
        // Content starts after the opening quote.
        columnStart = valueStart + 1;
        if (escapeCount != 0) {
            byte[] bytes = new byte[valueSize - escapeCount];
            // Stop exactly at the end of the content: the closing quote is not part of it.
            int rawEnd = columnStart + valueSize;
            for (int j = columnStart, k = 0; j < rawEnd; ++j) {
                byte c = buf[j];
                bytes[k++] = c;
                if (c == '"' && j + 1 < rawEnd && buf[j + 1] == '"') {
                    ++j;
                }
            }
            columnBuf = bytes;
            columnStart = 0;
            columnSize = bytes.length;
        }
    }
    consumer.accept(rowCount, columnIndex, columnBuf, columnStart, columnSize, charset);
}
}