// TsvParserTest.java
/*******************************************************************************
* Copyright 2014 Univocity Software Pty Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package com.univocity.parsers.tsv;
import com.univocity.parsers.*;
import com.univocity.parsers.common.*;
import com.univocity.parsers.common.processor.*;
import com.univocity.parsers.csv.*;
import org.testng.annotations.*;
import java.io.*;
import java.util.*;
import static org.testng.Assert.*;
public class TsvParserTest extends ParserTestCase {
/**
 * Supplies the (TSV resource path, line separator) pairs used by the data-driven tests.
 * A {@code null} separator entry is forwarded as-is; {@link #newTsvInputSettings(char[])}
 * turns it into line separator detection.
 *
 * @return one row per test invocation: resource path followed by the separator characters (or {@code null})
 */
@DataProvider(name = "tsvProvider")
public Object[][] tsvProvider() {
    final char[] unixSeparator = {'\n'};
    final char[] dosSeparator = {'\r', '\n'};
    final char[] detectSeparator = null;

    return new Object[][]{
        {"/tsv/essential.tsv", unixSeparator},
        {"/tsv/essential-dos.tsv", dosSeparator},
        {"/tsv/essential.tsv", detectSeparator},
        {"/tsv/essential-dos.tsv", detectSeparator},
        {"/tsv/essential-mac.tsv", detectSeparator}
    };
}
/**
 * Parses the given TSV resource with header extraction enabled and both leading and trailing
 * whitespace ignored, then checks the headers and rows gathered through {@code processor}.
 *
 * @param tsvFile       resource path handed to {@code newReader}
 * @param lineSeparator line separator to configure, or {@code null} to request detection
 * @throws Exception propagated from reader creation or parsing
 */
@Test(enabled = true, dataProvider = "tsvProvider")
public void parseIgnoringWhitespaces(String tsvFile, char[] lineSeparator) throws Exception {
    TsvParserSettings trimming = newTsvInputSettings(lineSeparator);
    trimming.setRowProcessor(processor);
    trimming.setHeaderExtractionEnabled(true);
    trimming.setIgnoreLeadingWhitespaces(true);
    trimming.setIgnoreTrailingWhitespaces(true);

    new TsvParser(trimming).parse(newReader(tsvFile));

    String[] headers = {"Year", "Make", "Model", "Description", "Price"};
    // Blank values are expected as null because they are trimmed away entirely.
    String[][] rows = {
        {"1997", "Ford", "E350", "ac, abs, moon", "3000.00"},
        {"1999", "Chevy", "Venture \"Extended Edition\"", null, "4900.00"},
        {"1996", "Jeep", "Grand Cherokee", "MUST SELL!\nair, moon roof, loaded", "4799.00"},
        {"1999", "Chevy", "Venture \"Extended Edition, Very Large\"", null, "5000.00"},
        {null, null, "Venture \"Extended Edition\"", null, "4900.00"},
        {null, null, null, null, null},
        {null, null, null, null, null},
        {null, null, "5", null, null},
        {"1997", "Ford", "E350", "ac, abs, moon", "\"3000.00\""},
        {"1997", "Ford", "E350", "ac, abs, moon", "3000.00"},
        {"1997", "Ford", "E350", "ac, abs, moon", "3000.00"},
        {"19 97", "Fo rd", "E350", "ac, abs, moon", "3000.00"},
        {null, null, null, "\" \"", "30 00.00"},
        {"1997", "Ford", "E350", "\" ac, abs, moon \"", "3000.00"},
        {"1997", "Ford", "E350", "\" ac, abs, moon \"", "3000.00"},
    };

    assertHeadersAndValuesMatch(headers, rows);
}
/**
 * Creates fresh TSV parser settings for the given line separator.
 *
 * @param lineSeparator characters to set as the format's line separator; when {@code null},
 *                      line separator detection is enabled instead
 * @return a new {@link TsvParserSettings} instance configured accordingly
 */
protected TsvParserSettings newTsvInputSettings(char[] lineSeparator) {
    TsvParserSettings configured = new TsvParserSettings();
    if (lineSeparator != null) {
        configured.getFormat().setLineSeparator(lineSeparator);
        return configured;
    }
    // No explicit separator supplied: let the parser work it out from the input.
    configured.setLineSeparatorDetectionEnabled(true);
    return configured;
}
/**
 * Parses the given TSV resource while keeping leading and trailing whitespace, with
 * {@code "?????"} configured as the null value, then checks the collected headers and rows.
 * Whitespace inside the expected literals is significant.
 *
 * @param tsvFile       resource path handed to {@code newReader}
 * @param lineSeparator line separator to configure, or {@code null} to request detection
 * @throws Exception propagated from reader creation or parsing
 */
@Test(enabled = true, dataProvider = "tsvProvider")
public void parseUsingWhitespaces(String tsvFile, char[] lineSeparator) throws Exception {
    // Marker the parser substitutes for values it reads as null.
    final String missing = "?????";

    TsvParserSettings preserving = newTsvInputSettings(lineSeparator);
    preserving.setRowProcessor(processor);
    preserving.setHeaderExtractionEnabled(true);
    preserving.setNullValue(missing);
    preserving.setIgnoreLeadingWhitespaces(false);
    preserving.setIgnoreTrailingWhitespaces(false);

    new TsvParser(preserving).parse(newReader(tsvFile));

    String[] headers = {"Year", "Make", "Model", "Description", "Price"};
    String[][] rows = {
        {"1997", "Ford", "E350", "ac, abs, moon", "3000.00"},
        {"1999", "Chevy", "Venture \"Extended Edition\"", missing, "4900.00"},
        {"1996", "Jeep", "Grand Cherokee", "MUST SELL!\nair, moon roof, loaded", "4799.00"},
        {"1999", "Chevy", "Venture \"Extended Edition, Very Large\"", missing, "5000.00"},
        {missing, missing, "Venture \"Extended Edition\"", missing, "4900.00"},
        {missing, missing, missing, missing, missing},
        {" ", " ", " ", " ", " "},
        {missing, missing, " 5 ", missing, missing},
        {" "},
        {"1997 ", " Ford ", "E350", "ac, abs, moon\t", " \"3000.00\" \t"},
        {"1997", " Ford ", "E350", " ac, abs, moon \t", "3000.00 \t"},
        {" 1997", " Ford ", "E350", " ac, abs, moon \t", "3000.00"},
        {" 19 97 ", " Fo rd ", "E350", " ac, abs, moon \t", "3000.00"},
        {"\t\t", " ", " ", " \" \"\t", "30 00.00\t"},
        {"1997", "Ford", "E350", " \" ac, abs, moon \" ", "3000.00"},
        {"1997", "Ford", "E350", "\" ac, abs, moon \" ", "3000.00"},
    };

    assertHeadersAndValuesMatch(headers, rows);
}
/**
 * Parses the given TSV resource selecting only the {@code "Year"} field with column
 * reordering disabled. The expected rows keep all five positions, with {@code null}
 * in every position other than the first.
 *
 * @param tsvFile       resource path handed to {@code newReader}
 * @param lineSeparator line separator to configure, or {@code null} to request detection
 * @throws Exception propagated from reader creation or parsing
 */
@Test(enabled = true, dataProvider = "tsvProvider")
public void parseColumns(String tsvFile, char[] lineSeparator) throws Exception {
    TsvParserSettings yearOnly = newTsvInputSettings(lineSeparator);
    yearOnly.setRowProcessor(processor);
    yearOnly.setHeaderExtractionEnabled(true);
    yearOnly.setIgnoreLeadingWhitespaces(true);
    yearOnly.setIgnoreTrailingWhitespaces(true);
    yearOnly.selectFields("Year");
    yearOnly.setColumnReorderingEnabled(false);

    new TsvParser(yearOnly).parse(newReader(tsvFile));

    String[] headers = {"Year", "Make", "Model", "Description", "Price"};
    String[][] rows = {
        {"1997", null, null, null, null},
        {"1999", null, null, null, null},
        {"1996", null, null, null, null},
        {"1999", null, null, null, null},
        {null, null, null, null, null},
        {null, null, null, null, null},
        {null, null, null, null, null},
        {null, null, null, null, null},
        {"1997", null, null, null, null},
        {"1997", null, null, null, null},
        {"1997", null, null, null, null},
        {"19 97", null, null, null, null},
        {null, null, null, null, null},
        {"1997", null, null, null, null},
        {"1997", null, null, null, null},
    };

    assertHeadersAndValuesMatch(headers, rows);
}
/**
 * Parses {@code input} with at most one column selection applied and returns the single
 * resulting row. Header extraction is enabled only when a name-based argument is given.
 * When several selection arguments are non-null, only the first one in this order is used:
 * {@code indexesToExclude}, {@code fieldsToExclude}, {@code indexesToSelect}, {@code fieldsToSelect}.
 *
 * @param input            TSV text to parse
 * @param indexesToExclude column indexes to exclude, or {@code null}
 * @param indexesToSelect  column indexes to select, or {@code null}
 * @param fieldsToExclude  header names to exclude, or {@code null}
 * @param fieldsToSelect   header names to select, or {@code null}
 * @return the only row produced by the parser (the method asserts that exactly one row exists)
 */
private String[] process(String input, Integer[] indexesToExclude, Integer[] indexesToSelect, String[] fieldsToExclude, String[] fieldsToSelect) {
    final boolean selectingByName = fieldsToExclude != null || fieldsToSelect != null;

    RowListProcessor collected = new RowListProcessor();
    TsvParserSettings config = new TsvParserSettings();
    config.setRowProcessor(collected);
    config.setHeaderExtractionEnabled(selectingByName);

    if (indexesToExclude != null) {
        config.excludeIndexes(indexesToExclude);
    } else if (fieldsToExclude != null) {
        config.excludeFields(fieldsToExclude);
    } else if (indexesToSelect != null) {
        config.selectIndexes(indexesToSelect);
    } else if (fieldsToSelect != null) {
        config.selectFields(fieldsToSelect);
    }

    new TsvParser(config).parse(new StringReader(input));

    List<String[]> parsedRows = collected.getRows();
    assertEquals(parsedRows.size(), 1);
    return parsedRows.get(0);
}
/**
 * Verifies index-based and name-based column selection and exclusion against a single
 * five-column record, using the {@code process} helper.
 * <p>
 * The column separators are written as explicit {@code \t} escapes: the assertions below
 * only hold for five tab-delimited columns, and an escape cannot be silently turned into a
 * space by an editor or formatter the way a literal tab character can.
 */
@Test(enabled = true)
public void columnSelectionTest() {
    String[] result;

    // Index-based selection: no header row in the input.
    String input = "a\tb\tc\td\te";

    Integer[] indexesToExclude = new Integer[]{0, 4};
    result = process(input, indexesToExclude, null, null, null);
    assertEquals(result, new String[]{"b", "c", "d"});

    Integer[] indexesToSelect = new Integer[]{0, 4};
    result = process(input, null, indexesToSelect, null, null);
    assertEquals(result, new String[]{"a", "e"});

    // Name-based selection: first line provides the headers, second line the record.
    input = "ha\thb\thc\thd\the\na\tb\tc\td\te";

    String[] fieldsToExclude = new String[]{"hb", "hd"};
    result = process(input, null, null, fieldsToExclude, null);
    assertEquals(result, new String[]{"a", "c", "e"});

    String[] fieldsToSelect = new String[]{"hb", "hd"};
    result = process(input, null, null, null, fieldsToSelect);
    assertEquals(result, new String[]{"b", "d"});
}
/**
 * Builds the row processor used by the tests in this class: a {@link RowListProcessor}
 * that additionally skips lines of the input at two points.
 *
 * @return a processor that skips 2 lines when processing starts and 1 more line after
 *         handling a row while {@code currentLine()} reports 8
 */
@Override
protected RowListProcessor newRowListProcessor() {
    return new RowListProcessor() {
        @Override
        public void processStarted(ParsingContext parsingContext) {
            super.processStarted(parsingContext);
            // NOTE(review): presumably the test resources begin with two lines that are not data - confirm.
            parsingContext.skipLines(2);
        }

        @Override
        public void rowProcessed(String[] parsedRow, ParsingContext parsingContext) {
            super.rowProcessed(parsedRow, parsingContext);
            final boolean atLineEight = parsingContext.currentLine() == 8;
            if (atLineEight) {
                parsingContext.skipLines(1);
            }
        }
    };
}
/**
 * Reads the given TSV resource record by record through {@code beginParsing}/{@code parseNext},
 * keeping only rows with exactly five values, and compares them with the expected data.
 * Headers are set explicitly to {@code YR, MK, MDL, DSC, PRC}, and the headers reported by
 * {@code processor} are expected to match those names.
 *
 * @param tsvFile       resource path handed to {@code newReader}
 * @param lineSeparator line separator to configure, or {@code null} to request detection
 * @throws Exception propagated from reader creation or parsing
 */
@Test(enabled = true, dataProvider = "tsvProvider")
public void parseOneByOne(String tsvFile, char[] lineSeparator) throws Exception {
    TsvParserSettings config = newTsvInputSettings(lineSeparator);
    config.setRowProcessor(processor);
    config.setHeaderExtractionEnabled(true);
    config.setIgnoreLeadingWhitespaces(true);
    config.setIgnoreTrailingWhitespaces(true);
    config.setHeaders("YR", "MK", "MDL", "DSC", "PRC");

    List<Object[]> fiveColumnRows = new ArrayList<Object[]>();
    TsvParser parser = new TsvParser(config);
    try {
        parser.beginParsing(newReader(tsvFile));
        for (Object[] row = parser.parseNext(); row != null; row = parser.parseNext()) {
            if (row.length != 5) {
                continue;
            }
            fiveColumnRows.add(row);
        }
    } finally {
        // Always release the parser, even when reading fails part-way through.
        parser.stopParsing();
    }

    String[] expectedHeaders = {"YR", "MK", "MDL", "DSC", "PRC"};
    String[][] expectedRows = {
        {"1997", "Ford", "E350", "ac, abs, moon", "3000.00"},
        {"1999", "Chevy", "Venture \"Extended Edition\"", null, "4900.00"},
        {"1996", "Jeep", "Grand Cherokee", "MUST SELL!\nair, moon roof, loaded", "4799.00"},
        {"1999", "Chevy", "Venture \"Extended Edition, Very Large\"", null, "5000.00"},
        {null, null, "Venture \"Extended Edition\"", null, "4900.00"},
        {null, null, null, null, null},
        {null, null, null, null, null},
        {null, null, "5", null, null},
        {"1997", "Ford", "E350", "ac, abs, moon", "\"3000.00\""},
        {"1997", "Ford", "E350", "ac, abs, moon", "3000.00"},
        {"1997", "Ford", "E350", "ac, abs, moon", "3000.00"},
        {"19 97", "Fo rd", "E350", "ac, abs, moon", "3000.00"},
        {null, null, null, "\" \"", "30 00.00"},
        {"1997", "Ford", "E350", "\" ac, abs, moon \"", "3000.00"},
        {"1997", "Ford", "E350", "\" ac, abs, moon \"", "3000.00"},
    };

    Object[] actualHeaders = processor.getHeaders();
    TestUtils.assertEquals(actualHeaders, expectedHeaders);

    assertEquals(fiveColumnRows.size(), expectedRows.length);
    int index = 0;
    while (index < expectedRows.length) {
        Object[] actualRow = fiveColumnRows.get(index);
        String[] expectedRow = expectedRows[index];
        assertEquals(actualRow, expectedRow);
        index++;
    }
}
/**
 * Parses the given TSV resource with the number of records to read limited to 3 and checks
 * that only the first three data rows are collected.
 *
 * @param tsvFile       resource path handed to {@code newReader}
 * @param lineSeparator line separator to configure, or {@code null} to request detection
 * @throws Exception propagated from reader creation or parsing
 */
@Test(enabled = true, dataProvider = "tsvProvider")
public void parse3Records(String tsvFile, char[] lineSeparator) throws Exception {
    TsvParserSettings limited = newTsvInputSettings(lineSeparator);
    limited.setRowProcessor(processor);
    limited.setHeaderExtractionEnabled(true);
    limited.setIgnoreLeadingWhitespaces(true);
    limited.setIgnoreTrailingWhitespaces(true);
    limited.setNumberOfRecordsToRead(3);

    new TsvParser(limited).parse(newReader(tsvFile));

    String[] headers = {"Year", "Make", "Model", "Description", "Price"};
    String[][] firstThree = {
        {"1997", "Ford", "E350", "ac, abs, moon", "3000.00"},
        {"1999", "Chevy", "Venture \"Extended Edition\"", null, "4900.00"},
        {"1996", "Jeep", "Grand Cherokee", "MUST SELL!\nair, moon roof, loaded", "4799.00"},
    };

    assertHeadersAndValuesMatch(headers, firstThree);
}
/**
 * Verifies line joining: with {@code setLineJoiningEnabled(true)}, a backslash placed
 * immediately before the line separator keeps the following line in the same value.
 * <p>
 * Columns are delimited with explicit {@code \t} escapes. Because values are not trimmed
 * here, the expected third value {@code "\nC"} is only reachable when the character in
 * front of the backslash is a real tab, so the separators must not depend on literal tab
 * characters surviving in the source file.
 */
@Test
public void parseWithLineJoining() {
    TsvParserSettings settings = new TsvParserSettings();
    settings.setLineJoiningEnabled(true);
    settings.getFormat().setLineSeparator("\n");
    settings.trimValues(false);

    TsvParser parser = new TsvParser(settings);

    // Two records, each with its third value continued on the next physical line.
    // The second record ends with an escaped backslash.
    List<String[]> result = parser.parseAll(new StringReader(
            "A\tB\t\\\nC\n" +
            "1\t2\t\\\n" +
            "3\\\\"));

    // Fail with a clear message instead of an index error if the records were split differently.
    assertEquals(result.size(), 2);
    assertEquals(result.get(0), new String[]{"A", "B", "\nC"});
    assertEquals(result.get(1), new String[]{"1", "2", "\n3\\"});
}
/**
 * With trailing whitespace ignored, an input made of two consecutive backslashes is
 * expected to produce one row whose first value is a single backslash.
 */
@Test
public void parseIgnoreTrailingWhitespaceAppendSlash() {
    RowListProcessor rowCollector = new RowListProcessor();

    TsvParserSettings config = new TsvParserSettings();
    config.setRowProcessor(rowCollector);
    config.setIgnoreTrailingWhitespaces(true);

    new TsvParser(config).parse(new StringReader("\\\\"));

    List<String[]> parsed = rowCollector.getRows();
    assertEquals(parsed.size(), 1);
    assertEquals(parsed.get(0)[0], "\\");
}
/**
 * With trailing whitespace ignored, the input {@code a} followed by a backslash and the
 * letter {@code r} is expected to produce one row whose first value is just {@code "a"}.
 */
@Test
public void parseIgnoreTrailingWhitespaceAppendBreakLineR() {
    RowListProcessor rowCollector = new RowListProcessor();

    TsvParserSettings config = new TsvParserSettings();
    config.setRowProcessor(rowCollector);
    config.setIgnoreTrailingWhitespaces(true);

    new TsvParser(config).parse(new StringReader("a\\r"));

    List<String[]> parsed = rowCollector.getRows();
    assertEquals(parsed.size(), 1);
    assertEquals(parsed.get(0)[0], "a");
}
/**
 * With trailing whitespace ignored and line joining enabled, a backslash directly before
 * the line separator is expected to merge both lines into the single value {@code "a\nb"}.
 */
@Test
public void parseIgnoreTrailingWhitespaceJoinLines() {
    RowListProcessor rowCollector = new RowListProcessor();

    TsvParserSettings config = new TsvParserSettings();
    config.getFormat().setLineSeparator("\n");
    config.setRowProcessor(rowCollector);
    config.setIgnoreTrailingWhitespaces(true);
    config.setLineJoiningEnabled(true);

    new TsvParser(config).parse(new StringReader("a\\\nb"));

    List<String[]> parsed = rowCollector.getRows();
    assertEquals(parsed.size(), 1);
    assertEquals(parsed.get(0)[0], "a\nb");
}
/**
 * With trailing whitespace ignored and line joining enabled, a backslash followed by a real
 * tab character is expected to still split the record: the first value keeps the backslash
 * and the second value is {@code "b"}.
 */
@Test
public void parseIgnoreTrailingWhitespaceEscapeTab() {
    RowListProcessor rowCollector = new RowListProcessor();

    TsvParserSettings config = new TsvParserSettings();
    config.getFormat().setLineSeparator("\n");
    config.setRowProcessor(rowCollector);
    config.setIgnoreTrailingWhitespaces(true);
    config.setLineJoiningEnabled(true);

    new TsvParser(config).parse(new StringReader("a\\\tb"));

    List<String[]> parsed = rowCollector.getRows();
    assertEquals(parsed.size(), 1);

    String[] values = parsed.get(0);
    assertEquals(values.length, 2);
    assertEquals(values[0], "a\\");
    assertEquals(values[1], "b");
}
/**
 * With trailing whitespace ignored and line joining enabled, a backslash followed by a
 * backspace character is expected to be kept verbatim inside a single value.
 */
@Test
public void parseIgnoreTrailingWhitespaceEscapeOther() {
    RowListProcessor rowCollector = new RowListProcessor();

    TsvParserSettings config = new TsvParserSettings();
    config.setRowProcessor(rowCollector);
    config.getFormat().setLineSeparator("\n");
    config.setIgnoreTrailingWhitespaces(true);
    config.setLineJoiningEnabled(true);

    new TsvParser(config).parse(new StringReader("a \\\bb"));

    List<String[]> parsed = rowCollector.getRows();
    assertEquals(parsed.size(), 1);

    String[] values = parsed.get(0);
    assertEquals(values.length, 1);
    assertEquals(values[0], "a \\\bb");
}
/**
 * With trailing whitespace preserved, the two-character sequence backslash + {@code r} in
 * the input is expected to come back as a carriage return appended to the value.
 */
@Test
public void parseNotIgnoreTrailingWhitespaceAppendBreakLineR() {
    RowListProcessor rowCollector = new RowListProcessor();

    TsvParserSettings config = new TsvParserSettings();
    config.getFormat().setLineSeparator("\n");
    config.setRowProcessor(rowCollector);
    config.setIgnoreTrailingWhitespaces(false);

    new TsvParser(config).parse(new StringReader("a \\r"));

    List<String[]> parsed = rowCollector.getRows();
    assertEquals(parsed.size(), 1);

    String[] values = parsed.get(0);
    assertEquals(values.length, 1);
    assertEquals(values[0], "a \r");
}
/**
 * With trailing whitespace preserved and line joining enabled, a backslash followed by a
 * real tab character is expected to still split the record: the first value keeps its
 * trailing space and backslash, the second value is {@code "b"}.
 */
@Test
public void parseNotIgnoreTrailingWhitespaceEscapeTab() {
    RowListProcessor rowCollector = new RowListProcessor();

    TsvParserSettings config = new TsvParserSettings();
    config.setRowProcessor(rowCollector);
    config.setIgnoreTrailingWhitespaces(false);
    config.setLineJoiningEnabled(true);

    new TsvParser(config).parse(new StringReader("a \\\tb"));

    List<String[]> parsed = rowCollector.getRows();
    assertEquals(parsed.size(), 1);

    String[] values = parsed.get(0);
    assertEquals(values.length, 2);
    assertEquals(values[0], "a \\");
    assertEquals(values[1], "b");
}
/**
 * With trailing whitespace preserved and line joining enabled, a backslash followed by a
 * backspace character is expected to be kept verbatim inside a single value.
 */
@Test
public void parseNotIgnoreTrailingWhitespaceEscapeOther() {
    RowListProcessor rowCollector = new RowListProcessor();

    TsvParserSettings config = new TsvParserSettings();
    config.setRowProcessor(rowCollector);
    config.setIgnoreTrailingWhitespaces(false);
    config.setLineJoiningEnabled(true);

    new TsvParser(config).parse(new StringReader("a \\\bb"));

    List<String[]> parsed = rowCollector.getRows();
    assertEquals(parsed.size(), 1);

    String[] values = parsed.get(0);
    assertEquals(values.length, 1);
    assertEquals(values[0], "a \\\bb");
}
/**
 * Selects four fields by name where only {@code h2} and {@code h3} exist in the header row.
 * Each parsed row is expected to have four positions: the two matching values followed by
 * {@code null} for the names ({@code h9}, {@code h8}) that are not present in the input.
 */
@Test
public void testFieldSelectionWithMismatchingNames() {
    String input = "h1\th2\th3\n"
            + "1\t2\t3\n"
            + "4\t5\t6";

    TsvParserSettings config = new TsvParserSettings();
    config.getFormat().setLineSeparator("\n");
    config.selectFields("h2", "h3", "h9", "h8");
    config.setHeaderExtractionEnabled(true);

    List<String[]> rows = new TsvParser(config).parseAll(new StringReader(input));

    // Values expected in the first two positions of each data row.
    String[][] matched = {
        {"2", "3"},
        {"5", "6"},
    };
    for (int r = 0; r < matched.length; r++) {
        String[] row = rows.get(r);
        assertEquals(row[0], matched[r][0]);
        assertEquals(row[1], matched[r][1]);
        assertNull(row[2]);
        assertNull(row[3]);
        assertEquals(row.length, 4);
    }
    assertEquals(rows.size(), 2);
}
/**
 * Round-trips a 100000-digit line through {@code parseLine} with the maximum characters per
 * column set to {@code -1}, then rebuilds the line from the parsed values and compares it
 * with the input. A tab is inserted after every digit whose index is a multiple of 10000.
 */
@Test
public void parseWithAutoExpansion() {
    final int totalDigits = 100000;
    final int tabInterval = 10000;

    TsvParserSettings config = new TsvParserSettings();
    // NOTE(review): -1 presumably removes the per-column size limit (see test name) - confirm against the settings docs.
    config.setMaxCharsPerColumn(-1);

    StringBuilder source = new StringBuilder(totalDigits);
    int position = 0;
    while (position < totalDigits) {
        source.append(position % 10);
        if (position % tabInterval == 0) {
            source.append('\t');
        }
        position++;
    }

    String[] values = new TsvParser(config).parseLine(source.toString());

    StringBuilder rebuilt = new StringBuilder();
    for (int i = 0; i < values.length; i++) {
        // A separator goes in front of every value once something has been written.
        if (rebuilt.length() > 0) {
            rebuilt.append('\t');
        }
        rebuilt.append(values[i]);
    }

    assertEquals(rebuilt.toString(), source.toString());
}
/**
 * Checks parsing with {@code setSkipBitsAsWhitespace(false)}. The assertions expect the
 * characters with code 0 and 1 to be retained at the start or end of a value, while a
 * leading character with code 2 (and the surrounding spaces) is still dropped.
 */
@Test
public void testBitsAreNotDiscardedWhenParsing() {
    TsvParserSettings config = new TsvParserSettings();
    config.setSkipBitsAsWhitespace(false);
    TsvParser parser = new TsvParser(config);

    // Leading character 0 is kept.
    String[] first = parser.parseLine("\0 a\tb");
    assertEquals(first.length, 2);
    assertEquals(first[0], "\0 a");
    assertEquals(first[1], "b");

    // Leading character 1 and trailing character 0 are kept; the leading space of the second value is not.
    String[] second = parser.parseLine("\1 a\t b \0");
    assertEquals(second.length, 2);
    assertEquals(second[0], "\1 a");
    assertEquals(second[1], "b \0");

    // Leading character 2 is dropped; the escaped tab in the second value becomes a real tab.
    String[] third = parser.parseLine("\2 a\t b\\t \1 ");
    assertEquals(third.length, 2);
    assertEquals(third[0], "a");
    assertEquals(third[1], "b\t \1");
}
}