CsvReaderTest.java
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package tech.tablesaw.io.csv;
import static java.util.Arrays.asList;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static tech.tablesaw.api.ColumnType.*;
import com.google.common.collect.ImmutableMap;
import com.univocity.parsers.common.TextParsingException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.file.Paths;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.Set;
import java.util.TreeSet;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import tech.tablesaw.api.ColumnType;
import tech.tablesaw.api.DateColumn;
import tech.tablesaw.api.DateTimeColumn;
import tech.tablesaw.api.DoubleColumn;
import tech.tablesaw.api.LongColumn;
import tech.tablesaw.api.ShortColumn;
import tech.tablesaw.api.StringColumn;
import tech.tablesaw.api.Table;
import tech.tablesaw.columns.datetimes.DateTimeParser;
import tech.tablesaw.columns.numbers.DoubleColumnType;
import tech.tablesaw.columns.numbers.NumberColumnFormatter;
import tech.tablesaw.io.AddCellToColumnException;
/** Tests for CSV Reading */
public class CsvReaderTest {
private static final String LINE_END = System.lineSeparator();
private static final String COMMA = ",";
private final ColumnType[] bus_types = {SHORT, STRING, STRING, FLOAT, FLOAT};
private final ColumnType[] bus_types_with_SKIP = {SHORT, STRING, SKIP, DOUBLE, DOUBLE};
@Test
public void testMaxCharsPerColumnPass() throws IOException {
final Reader reader =
new StringReader("Text" + LINE_END + "\"short\"" + LINE_END + "1234567890" + LINE_END);
final int maxCharsPerColumn = 12;
Table result =
Table.read().csv(CsvReadOptions.builder(reader).maxCharsPerColumn(maxCharsPerColumn));
assertEquals(2, result.rowCount());
}
@Test
public void testMaxCharsPerColumnException() {
final Reader reader =
new StringReader("Text" + LINE_END + "\"short\"" + LINE_END + "1234567890" + LINE_END);
final int maxCharsPerColumn = 8;
assertThrows(
TextParsingException.class,
() -> {
Table.read().csv(CsvReadOptions.builder(reader).maxCharsPerColumn(maxCharsPerColumn));
});
}
@Test
public void testWithBusData() throws IOException {
// Read the CSV file
Table table =
Table.read()
.csv(CsvReadOptions.builder("../data/bus_stop_test.csv").columnTypes(bus_types));
// Look at the column names
assertEquals(
"[stop_id, stop_name, stop_desc, stop_lat, stop_lon]", table.columnNames().toString());
table = table.sortDescendingOn("stop_id");
table.removeColumns("stop_desc");
}
@Test
@Disabled
/** TODO: Remove. Text columns should be used as backing column types */
void textColumnShutoff() throws IOException {
Table table = Table.read().csv(CsvReadOptions.builder("../data/cities-states-zipcode.csv"));
ColumnType[] types = {INTEGER, STRING, DOUBLE, BOOLEAN};
List<ColumnType> typesToDetect = asList(types);
Table table2 =
Table.read()
.csv(
CsvReadOptions.builder("../data/cities-states-zipcode.csv")
.columnTypesToDetect(typesToDetect)
.build());
assertEquals(STRING, table.column("WorldRegion").type());
assertEquals(STRING, table2.column("WorldRegion").type());
}
@Test
public void testWithColumnSKIP() throws IOException {
// Read the CSV file
Table table =
Table.read()
.csv(
CsvReadOptions.builder("../data/bus_stop_test.csv")
.columnTypes(bus_types_with_SKIP));
assertEquals(4, table.columnCount());
// Look at the column names
assertEquals("[stop_id, stop_name, stop_lat, stop_lon]", table.columnNames().toString());
}
@Test
void allowDuplicateColumnNames() throws IOException {
final Reader reader1 =
new StringReader(
"Col1" + COMMA + "Col2" + LINE_END + "\"first\"" + COMMA + "second" + LINE_END);
Table noDupes = Table.read().csv(reader1);
assertEquals("Col1", noDupes.columnNames().get(0));
assertEquals("Col2", noDupes.columnNames().get(1));
final Reader reader2 =
new StringReader(
"Col1" + COMMA + "Col1" + LINE_END + "\"first\"" + COMMA + "second" + LINE_END);
Table dupes =
Table.read().csv(CsvReadOptions.builder(reader2).allowDuplicateColumnNames(true).build());
assertEquals("Col1", dupes.columnNames().get(0));
assertEquals("Col1-2", dupes.columnNames().get(1));
}
@Test
void allowDuplicateColumnNamesInsensitive() {
Reader reader2 =
new StringReader(
"Col1" + COMMA + "col1" + LINE_END + "first" + COMMA + "second" + LINE_END);
Table dupes =
Table.read().csv(CsvReadOptions.builder(reader2).allowDuplicateColumnNames(true).build());
assertEquals("Col1", dupes.columnNames().get(0));
assertEquals("col1-2", dupes.columnNames().get(1));
}
@Test
public void testWithColumnSKIPWithoutHeader() throws IOException {
// Read the CSV file
Table table =
Table.read()
.csv(
CsvReadOptions.builder("../data/bus_stop_noheader_test.csv")
.header(false)
.columnTypes(bus_types_with_SKIP));
assertEquals(4, table.columnCount());
// Look at the column names
assertEquals("[C0, C1, C3, C4]", table.columnNames().toString());
}
@Test
public void testWithBushData() throws IOException {
// Read the CSV file
ColumnType[] types = {LOCAL_DATE, DOUBLE, STRING};
Table table = Table.read().csv(CsvReadOptions.builder("../data/bush.csv").columnTypes(types));
assertEquals(323, table.rowCount());
// Look at the column names
assertEquals("[date, approval, who]", table.columnNames().toString());
}
@Test
public void testBushDataWithoutSamplingForTypeDetection() throws IOException {
// Read the CSV file
Table table = Table.read().csv(CsvReadOptions.builder("../data/bush.csv").sample(false));
assertEquals(323, table.rowCount());
// Look at the column names
assertEquals("[date, approval, who]", table.columnNames().toString());
}
@Test
public void testDataTypeDetection() throws IOException {
Reader reader = new FileReader("../data/bus_stop_test.csv");
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(true)
.minimizeColumnSizes()
.separator(',')
.sample(false)
.locale(Locale.getDefault())
.build();
ColumnType[] columnTypes = new CsvReader().detectColumnTypes(reader, options);
assertArrayEquals(bus_types, columnTypes);
}
@Test
public void testNumberTypeDetectionIgnoreZeroDecimal() throws IOException {
Reader reader = new FileReader("../data/immunization.csv");
CsvReadOptions options =
CsvReadOptions.builder(reader).header(true).sample(false).ignoreZeroDecimal(true).build();
// Column index 3 and 7 contain values with none to 3 zero values as suffix
// Should map to type INTEGER when ignoreZeroDecimal = true
ColumnType[] columnTypes = new CsvReader().detectColumnTypes(reader, options);
assertEquals(INTEGER, columnTypes[3]);
assertEquals(INTEGER, columnTypes[7]);
}
@Test
public void testNumberTypeDetectionRetainZeroDecimal() throws IOException {
Reader reader = new FileReader("../data/immunization.csv");
CsvReadOptions options =
CsvReadOptions.builder(reader).header(true).sample(false).ignoreZeroDecimal(false).build();
// Column index 3 and 7 contain values with none to 3 zero values as suffix
// Should map to type DOUBLE when ignoreZeroDecimal = false
ColumnType[] columnTypes = new CsvReader().detectColumnTypes(reader, options);
assertEquals(DOUBLE, columnTypes[3]);
assertEquals(DOUBLE, columnTypes[7]);
}
@Test
public void testMillis() {
long[] times = {1530486314124L, 1530488214124L};
LongColumn d = LongColumn.create("times", times);
DateTimeColumn column = d.asDateTimes(ZoneOffset.UTC);
assertEquals(1530486314124L, column.get(0).toInstant(ZoneOffset.UTC).toEpochMilli());
}
@Test
public void testLocalDateDetectionEnglish() {
final Reader reader =
new StringReader(
"Date"
+ LINE_END
+ "\"Nov 1, 2017\""
+ LINE_END
+ "\"Oct 1, 2017\""
+ LINE_END
+ "\"Sep 1, 2017\""
+ LINE_END
+ "\"Aug 1, 2017\""
+ LINE_END
+ "\"Jul 1, 2017\""
+ LINE_END
+ "\"Jun 1, 2017\""
+ LINE_END);
final boolean header = true;
final char delimiter = ',';
final boolean useSampling = true;
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(header)
.separator(delimiter)
.sample(useSampling)
.locale(Locale.ENGLISH)
.build();
final List<ColumnType> actual = asList(new CsvReader().detectColumnTypes(reader, options));
assertEquals(Collections.singletonList(LOCAL_DATE), actual);
}
@Test
public void testDateTimeDetection() {
final Reader reader =
new StringReader(
"Date"
+ LINE_END
+ "09-Nov-2014 13:03:04"
+ LINE_END
+ "09-Oct-2014 13:03:56"
+ LINE_END);
final boolean header = true;
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(header)
.dateTimeFormat(DateTimeParser.caseInsensitiveFormatter("dd-MMM-yyyy HH:mm:ss"))
.build();
final List<ColumnType> actual = asList(new CsvReader().detectColumnTypes(reader, options));
assertEquals(Collections.singletonList(LOCAL_DATE_TIME), actual);
}
@Test
public void testDateTimeDetection2() {
final Reader reader =
new StringReader(
"Date"
+ LINE_END
+ "09-Nov-2014 13:03:04"
+ LINE_END
+ "09-Oct-2014 13:03:56"
+ LINE_END);
final boolean header = true;
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(header)
.dateTimeFormat(DateTimeParser.caseInsensitiveFormatter("dd-MMM-yyyy HH:mm:ss"))
.build();
final List<ColumnType> actual = asList(new CsvReader().detectColumnTypes(reader, options));
assertEquals(Collections.singletonList(LOCAL_DATE_TIME), actual);
}
@Test
public void testDateTimeDetection3() {
final Reader reader =
new StringReader(
"Date"
+ LINE_END
+ "09-NOV-2014 13:03:04"
+ LINE_END
+ "09-OCT-2014 13:03:56"
+ LINE_END);
final boolean header = true;
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(header)
.dateTimeFormat(
new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.appendPattern("dd-MMM-yyyy HH:mm:ss")
.toFormatter())
.build();
final List<ColumnType> actual = asList(new CsvReader().detectColumnTypes(reader, options));
assertEquals(Collections.singletonList(LOCAL_DATE_TIME), actual);
}
@Test
public void testDateDetection1() {
final Reader reader =
new StringReader("Time" + LINE_END + "13.03.04" + LINE_END + "13.03.04" + LINE_END);
final boolean header = true;
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(header)
.timeFormat(
new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.appendPattern("HH.mm.ss")
.toFormatter())
.build();
final List<ColumnType> actual = asList(new CsvReader().detectColumnTypes(reader, options));
assertEquals(Collections.singletonList(LOCAL_TIME), actual);
}
@Test
public void testTimeDetection1() {
final Reader reader =
new StringReader("Date" + LINE_END + "09-NOV-2014" + LINE_END + "09-OCT-2014" + LINE_END);
final boolean header = true;
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(header)
.dateFormat(
new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.appendPattern("dd-MMM-yyyy")
.toFormatter())
.build();
final List<ColumnType> actual = asList(new CsvReader().detectColumnTypes(reader, options));
assertEquals(Collections.singletonList(LOCAL_DATE), actual);
}
@Test
public void testLocalDateDetectionFrench() {
final Reader reader =
new StringReader(
"Date"
+ LINE_END
+ "\"nov. 1, 2017\""
+ LINE_END
+ "\"oct. 1, 2017\""
+ LINE_END
+ "\"sept. 1, 2017\""
+ LINE_END
+ "\"ao��t 1, 2017\""
+ LINE_END
+ "\"juil. 1, 2017\""
+ LINE_END
+ "\"juin 1, 2017\""
+ LINE_END);
final boolean header = true;
final char delimiter = ',';
final boolean useSampling = true;
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(header)
.separator(delimiter)
.sample(useSampling)
.locale(Locale.FRENCH)
.build();
final List<ColumnType> actual = asList(new CsvReader().detectColumnTypes(reader, options));
assertEquals(Collections.singletonList(LOCAL_DATE), actual);
}
@Test
public void testLocalDateTimeDetectionFrench() {
final Reader reader =
new StringReader(
"Date"
+ LINE_END
+ "09-nov.-2014 13:03"
+ LINE_END
+ "09-oct.-2014 13:03"
+ LINE_END
+ "09-sept.-2014 13:03"
+ LINE_END
+ "09-ao��t-2014 13:03"
+ LINE_END
+ "09-juil.-2014 13:03"
+ LINE_END
+ "09-juin-2014 13:03"
+ LINE_END);
final boolean header = true;
final char delimiter = ',';
final boolean useSampling = true;
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(header)
.separator(delimiter)
.sample(useSampling)
.locale(Locale.FRENCH)
.build();
final List<ColumnType> actual = asList(new CsvReader().detectColumnTypes(reader, options));
assertEquals(Collections.singletonList(LOCAL_DATE_TIME), actual);
}
@Test
void testWithMissingValue() throws IOException {
CsvReadOptions options =
CsvReadOptions.builder("../data/missing_values.csv").missingValueIndicator("-").build();
Table t = Table.read().csv(options);
assertEquals(1, t.stringColumn(0).countMissing());
assertEquals(1, t.numberColumn(1).countMissing());
assertEquals(1, t.numberColumn(2).countMissing());
}
/**
* Tests using multiple, non-standard missing value indicators, some columns also contains NA or
* N/A, which are treated as regular string values rather than missing values. This has the
* side-effect of treating those otherwise numeric columns as StringColumns
*/
@Test
void testWithMissingValues() throws IOException {
Reader reader =
new StringReader(
"Products,Sales,Market_Share\n"
+ "a,?,-\n"
+ "b,12200,NA\n"
+ "c,60000,33\n"
+ "d,N/A,10\n"
+ ",32000,42");
CsvReadOptions options =
CsvReadOptions.builder(reader).sample(false).missingValueIndicator("-", "?", "").build();
Table t = Table.read().csv(options);
assertEquals(1, t.stringColumn(0).countMissing());
assertEquals(1, t.stringColumn(1).countMissing());
assertEquals(1, t.stringColumn(2).countMissing());
}
/** Tests the auto-detection of missing values, using multiple missing value indicators */
@Test
void testWithMissingValue2() throws IOException {
Table t = Table.read().csv("../data/missing_values2.csv");
assertEquals(1, t.stringColumn(0).countMissing());
assertEquals(1, t.numberColumn(1).countMissing());
assertEquals(1, t.numberColumn(2).countMissing());
}
@Test
public void testWindowsAndLinuxLineEndings() throws IOException {
Reader reader =
new StringReader(
"TestCol\n"
+ "foobar1\n"
+ "foobar2\n"
+ "foobar3\n"
+ "foobar4\r\n"
+ "foobar5\r\n"
+ "foobar6\r\n");
Table t = Table.read().csv(reader);
assertEquals(1, t.columnCount());
assertEquals(6, t.rowCount());
}
@Test
public void testCustomLineEndings() throws IOException {
CsvReadOptions options =
CsvReadOptions.builder("../data/alt_line_endings.csv").lineEnding("~").header(true).build();
Table t = Table.read().csv(options);
assertEquals(2, t.columnCount());
assertEquals(2, t.rowCount());
}
@Test
public void testDateWithFormatter1() throws IOException {
final boolean header = false;
final char delimiter = ',';
final boolean useSampling = true;
CsvReadOptions options =
CsvReadOptions.builder("../data/date_format_test.txt")
.header(header)
.separator(delimiter)
.sample(useSampling)
.dateFormat(DateTimeFormatter.ofPattern("yyyy.MM.dd"))
.build();
final Table table = Table.read().csv(options);
DateColumn date = table.dateColumn(0);
assertFalse(date.isEmpty());
}
@Test
public void testDateWithFormatter2() throws IOException {
final boolean header = false;
final char delimiter = ',';
final boolean useSampling = true;
CsvReadOptions options =
CsvReadOptions.builder("../data/date_format_test.txt")
.header(header)
.separator(delimiter)
.sample(useSampling)
.dateFormat(DateTimeFormatter.ofPattern("yyyy.MM.dd"))
.build();
final Table table = Table.read().csv(options);
DateColumn date = table.dateColumn(0);
assertFalse(date.isEmpty());
}
@Test
public void testPrintStructure() throws IOException {
String output =
"ColumnType[] columnTypes = {"
+ LINE_END
+ "LOCAL_DATE, // 0 date "
+ LINE_END
+ "INTEGER, // 1 approval "
+ LINE_END
+ "STRING, // 2 who "
+ LINE_END
+ "}"
+ LINE_END;
assertEquals(
output,
new CsvReader()
.printColumnTypes(
CsvReadOptions.builder("../data/bush.csv")
.header(true)
.separator(',')
.locale(Locale.getDefault())
.sample(true)
.build()));
}
@Test
public void testDataTypeDetection2() throws IOException {
Reader reader = new FileReader("../data/bush.csv");
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(true)
.separator(',')
.sample(false)
.locale(Locale.getDefault())
.build();
ColumnType[] columnTypes = new CsvReader().detectColumnTypes(reader, options);
assertEquals(LOCAL_DATE, columnTypes[0]);
assertEquals(INTEGER, columnTypes[1]);
assertEquals(STRING, columnTypes[2]);
}
@Test
public void testLoadFromUrlWithtypeArray() throws IOException {
ColumnType[] types = {LOCAL_DATE, DOUBLE, STRING};
Table table;
try (InputStream input = new File("../data/bush.csv").toURI().toURL().openStream()) {
table =
Table.read()
.csv(
CsvReadOptions.builder(input)
.tableName("Bush approval ratings")
.columnTypes(types));
}
assertNotNull(table);
assertEquals(3, table.columnCount());
}
/** Read from a url while performing column type inference */
@Test
public void testLoadFromUrl() throws IOException {
Table table;
try (InputStream input = new File("../data/bush.csv").toURI().toURL().openStream()) {
table = Table.read().csv(CsvReadOptions.builder(input).tableName("Bush approval ratings"));
}
assertNotNull(table);
assertEquals(3, table.columnCount());
}
/** Read from a file input stream while performing column type inference */
@Test
public void testLoadFromFileStream() throws IOException {
String location = "../data/bush.csv";
Table table;
File file = Paths.get(location).toFile();
try (InputStream input = new FileInputStream(file)) {
table = Table.read().csv(CsvReadOptions.builder(input).tableName("Bush approval ratings"));
}
assertNotNull(table);
assertEquals(3, table.columnCount());
}
/** Read from a file input stream while performing column type inference */
@Test
public void testLoadFromFileStreamReader() throws IOException {
String location = "../data/bush.csv";
Table table;
File file = Paths.get(location).toFile();
try (Reader reader = new FileReader(file)) {
table = Table.read().csv(CsvReadOptions.builder(reader).tableName("Bush approval ratings"));
}
assertNotNull(table);
assertEquals(3, table.columnCount());
}
@Test
public void testEmptyRow() throws IOException {
Table table = Table.read().csv("../data/empty_row.csv");
// Note: tried capturing std err output and asserting on it, but it failed when running as mvn
// target
assertEquals(5, table.rowCount());
}
@Test
public void testShortRow() {
assertThrows(
AddCellToColumnException.class,
() -> {
Table.read().csv("../data/short_row.csv");
});
}
@Test
public void testLongRow() {
assertThrows(
RuntimeException.class,
() -> {
Table.read().csv("../data/long_row.csv");
});
}
@Test
public void testBoundary1() throws IOException {
Table table = Table.read().csv("../data/boundaryTest1.csv");
assertEquals(2, table.rowCount());
}
@Test
public void testBoundary2() throws IOException {
Table table = Table.read().csv("../data/boundaryTest2.csv");
assertEquals(2, table.rowCount());
}
@Test
public void testReadFailure() {
// TODO (lwhite): These tests don't fail. What was their intent?
Table table1 =
Table.read()
.csv(CsvReadOptions.builder("../data/read_failure_test.csv").minimizeColumnSizes());
table1.structure(); // just make sure the import completed
ShortColumn test = table1.shortColumn("Test");
// TODO(lwhite): Better tests
assertNotNull(test.summary());
}
@Test
public void testReadFailure2() throws IOException {
Table table1 =
Table.read()
.csv(CsvReadOptions.builder("../data/read_failure_test2.csv").minimizeColumnSizes());
table1.structure(); // just make sure the import completed
ShortColumn test = table1.shortColumn("Test");
// TODO(lwhite): Better tests
assertNotNull(test.summary());
}
@Test
public void testEmptyFileHeaderEnabled() throws IOException {
Table table1 = Table.read().csv(CsvReadOptions.builder("../data/empty_file.csv").header(true));
assertEquals("empty_file.csv: 0 rows X 0 cols", table1.shape());
}
@Test
public void testEmptyFileHeaderDisabled() throws IOException {
Table table1 = Table.read().csv(CsvReadOptions.builder("../data/empty_file.csv").header(false));
assertEquals("empty_file.csv: 0 rows X 0 cols", table1.shape());
}
@Test
public void testReadMaxColumnsExceeded() {
assertThrows(
TextParsingException.class,
() -> {
Table.read().csv(CsvReadOptions.builder("../data/10001_columns.csv").header(false));
});
}
@Test
public void testReadWithMaxColumnsSetting() throws IOException {
Table table1 =
Table.read()
.csv(
CsvReadOptions.builder("../data/10001_columns.csv")
.maxNumberOfColumns(10001)
.header(false));
assertEquals("10001_columns.csv: 1 rows X 10001 cols", table1.shape());
}
@Test
public void testSkipLinesWithComments() throws IOException {
Table table1 =
Table.read()
.csv(
CsvReadOptions.builder("../data/with_comments.csv")
.maxNumberOfColumns(3)
.commentPrefix('#')
.header(true));
assertEquals("with_comments.csv: 3 rows X 3 cols", table1.shape());
}
@Test
public void carriageReturnLineEnding() throws IOException {
Table table =
Table.read().csv(CsvReadOptions.builder("../data/sacramento_real_estate_transactions.csv"));
assertEquals(985, table.rowCount());
}
@Test
public void testReadCsvWithRowSampling() throws IOException {
int expectedSampleSize = 10;
Table table =
Table.read().csv(CsvReadOptions.builder("../data/cake.csv").sampleSize(expectedSampleSize));
assertEquals(expectedSampleSize, table.rowCount());
assertEquals("[recipe, mix, temp, y]", table.columnNames().toString());
}
@Test
public void testReadCsvWithRowSamplingParsingNumbers() throws IOException {
StringBuilder csv = new StringBuilder();
csv.append("RandomNumbers\n");
Set<Integer> values =
new TreeSet<>(
asList(
24323, 542345, 64323, 73640, 38453, 12735, 93456, 23457, 483075, 469364, 473936));
values.forEach(v -> csv.append(v + "\n"));
Reader reader = new StringReader(csv.toString());
int expectedSampleSize = 5;
Table t = Table.read().csv(CsvReadOptions.builder(reader).sampleSize(expectedSampleSize));
assertEquals(1, t.columnCount());
assertEquals(expectedSampleSize, t.rowCount());
assertEquals(INTEGER, t.column(0).type());
List<Integer> intValues = t.intColumn(0).asList();
assertEquals(true, values.containsAll(intValues));
}
@Test
public void preserveQuote() throws IOException {
Table table = Table.create("test", StringColumn.create("colName", Arrays.asList("\"")));
// test CSV writes quote properly
Writer writer = new StringWriter();
table.write().csv(writer);
String string = writer.toString();
// test CSV reads quote back again
Table out = Table.read().csv(new StringReader(string));
assertEquals(table.get(0, 0), out.get(0, 0));
}
@Test
public void testReadCsvWithPercentage1() throws IOException {
Table table = Table.read().csv(CsvReadOptions.builder("../data/currency_percent.csv"));
assertEquals(DoubleColumnType.instance(), table.typeArray()[1]);
assertEquals(DoubleColumnType.instance(), table.typeArray()[2]);
}
@Test
public void testReadCsvWithPercentage2() throws IOException {
Table table = Table.read().csv(CsvReadOptions.builder("../data/currency_percent.csv"));
DoubleColumn column = (DoubleColumn) table.column(1);
assertEquals("0.0132", column.getString(0));
assertEquals("0.32768", column.getString(1));
assertEquals("1", column.getString(2));
column.setPrintFormatter(NumberColumnFormatter.percent(2));
assertEquals("1.32%", column.getString(0));
assertEquals("32.77%", column.getString(1));
assertEquals("100.00%", column.getString(2));
}
@Test
public void testSkipRowsWithInvalidColumnCount() throws IOException {
Table table =
Table.read()
.csv(
CsvReadOptions.builder("../data/short_row.csv")
.skipRowsWithInvalidColumnCount(true)
.build());
assertEquals(2, table.rowCount());
}
@Test
public void skipRowsWithInvalidColumnCountWithoutHeader() throws IOException {
assertThrows(
AddCellToColumnException.class,
() -> {
Table.read()
.csv(
CsvReadOptions.builder("../data/short_row.csv")
.header(false)
.skipRowsWithInvalidColumnCount(true)
.build());
});
}
@Test
public void testCustomizedColumnTypesMixedWithDetection() throws IOException {
Reader reader = new FileReader("../data/bus_stop_test.csv");
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(true)
.separator(',')
.locale(Locale.getDefault())
.minimizeColumnSizes()
.columnTypesPartial(
columnName ->
Optional.ofNullable(
ImmutableMap.of("stop_id", STRING, "stop_name", STRING, "stop_lon", DOUBLE)
.get(columnName)))
.build();
ColumnType[] columnTypes = new CsvReader().read(options).typeArray();
ColumnType[] expectedTypes = Arrays.copyOf(bus_types, bus_types.length);
expectedTypes[0] = STRING; // stop_id
expectedTypes[1] = STRING; // stop_name
expectedTypes[4] = DOUBLE; // stop_lon
assertArrayEquals(expectedTypes, columnTypes);
}
@Test
public void testCustomizedColumnTypeAllCustomized() throws IOException {
Reader reader = new FileReader("../data/bus_stop_test.csv");
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(true)
.separator(',')
.locale(Locale.getDefault())
.minimizeColumnSizes()
.columnTypes(columnName -> STRING)
.build();
ColumnType[] columnTypes = new CsvReader().read(options).typeArray();
assertTrue(Arrays.stream(columnTypes).allMatch(columnType -> columnType.equals(STRING)));
}
@Test
public void testColumnsArePreservedWithNoDataIfCustomizedTypesAreProvided() throws IOException {
Reader reader = new FileReader("../data/bus_stop_test_no_data.csv");
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(true)
.separator(',')
.locale(Locale.getDefault())
.minimizeColumnSizes()
.columnTypesPartial(
ImmutableMap.of(
"stop_id",
SHORT,
"stop_name",
STRING,
"stop_desc",
STRING,
"stop_lat",
FLOAT,
"stop_lon",
FLOAT))
.build();
ColumnType[] columnTypes = new CsvReader().read(options).typeArray();
assertArrayEquals(bus_types, columnTypes);
}
@Test
public void testColumnsArePreservedWithNoDataIfCustomizedTypesAreProvided2() throws IOException {
Reader reader = new FileReader("../data/bus_stop_test_no_data.csv");
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(true)
.separator(',')
.locale(Locale.getDefault())
.minimizeColumnSizes()
.columnTypes(new ColumnType[] {SHORT, STRING, STRING, FLOAT, FLOAT})
.build();
ColumnType[] columnTypes = new CsvReader().read(options).typeArray();
assertArrayEquals(bus_types, columnTypes);
}
@Test
public void testColumnsArePreservedWithNoDataIfCustomizedTypesAreProvidedPartially()
throws IOException {
Reader reader = new FileReader("../data/bus_stop_test_no_data.csv");
CsvReadOptions options =
CsvReadOptions.builder(reader)
.header(true)
.separator(',')
.locale(Locale.getDefault())
.minimizeColumnSizes()
.columnTypesPartial(ImmutableMap.of("stop_id", SHORT, "stop_name", STRING))
.build();
ColumnType[] columnTypes = new CsvReader().read(options).typeArray();
assertArrayEquals(new ColumnType[] {SHORT, STRING}, columnTypes);
}
}