CsvFormatDetectorTest.java

/*******************************************************************************
 * Copyright 2015 Univocity Software Pty Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.univocity.parsers.csv;

import org.testng.annotations.*;

import java.io.*;
import java.util.*;

import static org.testng.Assert.*;

/**
 * Tests for the automatic CSV format detection of {@link CsvParser}: delimiter and quote
 * discovery on a set of sample inputs, full automatic detection via
 * {@link CsvParserSettings#detectFormatAutomatically()}, and detection restricted to an
 * explicit list of candidate delimiters.
 */
public class CsvFormatDetectorTest {

	/**
	 * Supplies the fixtures for {@link #testDelimiterDiscovery(String, List)}.
	 *
	 * @return an array of {@code {input, expectedRows}} pairs, where {@code input} is the raw
	 * CSV text and {@code expectedRows} is the list of rows the parser is expected to produce.
	 */
	@DataProvider
	public Object[][] getInputsAndOutputs() {
		return new Object[][]{
				// comma-delimited, no quotes
				{"A,B,C\n1,2,3\n1,2,3\n1,2,3",
						Arrays.asList(new String[]{"A", "B", "C"}, new String[]{"1", "2", "3"}, new String[]{"1", "2", "3"}, new String[]{"1", "2", "3"})},
				// semicolon-delimited, double-quoted values with backslash-escaped quotes; single quotes are plain content
				{"\"A\";'B';\"C\"\n\"1\\\" and \\\"2\";\"3\\\"A\";'B';\"C\"\n\"A\";'B';\"C\"\n\"A\";'B';\"C\"\n",
						Arrays.asList(new String[]{"A", "'B'", "C"}, new String[]{"1\" and \"2", "3\"A", "'B'", "C"}, new String[]{"A", "'B'", "C"}, new String[]{"A", "'B'", "C"})},
				// same as above, with single quotes also appearing inside a double-quoted value
				{"\"A\";'B';\"C\"\n\"1\\\" and \\\"2\";\"3' and '4\";\"5\\\" and \\\"6\"\n\"A\";'B';\"C\"\n\"A\";'B';\"C\"\n",
						Arrays.asList(new String[]{"A", "'B'", "C"}, new String[]{"1\" and \"2", "3' and '4", "5\" and \"6"}, new String[]{"A", "'B'", "C"}, new String[]{"A", "'B'", "C"})},
				// semicolon-delimited values that themselves contain commas
				{"1,2;2,3;3,4;a\n1,2;2,3;3,4;b\n1,2;2,3;3,4;c\n1,2;2,3;3,4;d\n",
						Arrays.asList(new String[]{"1,2", "2,3", "3,4", "a"}, new String[]{"1,2", "2,3", "3,4", "b"}, new String[]{"1,2", "2,3", "3,4", "c"}, new String[]{"1,2", "2,3", "3,4", "d"})},
				// header row with more columns than the data rows; values contain '$' and '.'
				{"A;B;C;D;E\n$1.2;$2.3;$3.4\n$1.2;$2.3;$3.4\n$1.2;$2.3;$3.4\n$1.2;$2.3;$3.4\n",
						Arrays.asList(new String[]{"A", "B", "C", "D", "E"}, new String[]{"$1.2", "$2.3", "$3.4"}, new String[]{"$1.2", "$2.3", "$3.4"}, new String[]{"$1.2", "$2.3", "$3.4"},
								new String[]{"$1.2", "$2.3", "$3.4"})},
				// comma-delimited, double-quoted values, one of which contains a single quote
				{"\"A'A\",\"BB\",\"CC\"\n\"11\",\"22\",\"33\"\n\"11\",\"22\",\"33\"\n\"11\",\"22\",\"33\"\n",
						Arrays.asList(new String[]{"A'A", "BB", "CC"}, new String[]{"11", "22", "33"}, new String[]{"11", "22", "33"}, new String[]{"11", "22", "33"})}

		};
	}

	/**
	 * Builds parser settings with delimiter and quote detection enabled and parsing of
	 * unescaped quotes disabled.
	 *
	 * <p>The delimiter, quote and quote escape are all set to {@code 'x'}. None of the fixture
	 * inputs use {@code 'x'}, so the expected rows can presumably only be produced when the
	 * detected format replaces these configured values.
	 *
	 * @return a fresh, independently configurable settings instance.
	 */
	private CsvParserSettings newSettings() {
		CsvParserSettings settings = new CsvParserSettings();
		settings.setDelimiterDetectionEnabled(true);
		settings.setQuoteDetectionEnabled(true);
		settings.setParseUnescapedQuotes(false);

		settings.getFormat().setLineSeparator("\n");
		settings.getFormat().setDelimiter('x');
		settings.getFormat().setQuote('x');
		settings.getFormat().setQuoteEscape('x');
		return settings;
	}

	/**
	 * Parses each fixture input with format detection enabled and verifies that the parsed
	 * rows match the expected rows, both in count and in content.
	 *
	 * @param input          the raw CSV text to parse.
	 * @param expectedOutput the rows the parser is expected to return for {@code input}.
	 */
	@Test(dataProvider = "getInputsAndOutputs")
	public void testDelimiterDiscovery(String input, List<String[]> expectedOutput) {
		CsvParserSettings settings = newSettings();

		CsvParser parser = new CsvParser(settings);

		List<String[]> rows = parser.parseAll(new StringReader(input));

		assertEquals(rows.size(), expectedOutput.size());
		for (int i = 0; i < rows.size(); i++) {
			// TestNG's argument order is (actual, expected); keeping it makes failure messages report the right side.
			assertEquals(rows.get(i), expectedOutput.get(i), "Unexpected content in row " + i);
		}
	}


	/**
	 * Verifies that fully automatic format detection finds {@code ';'} as the delimiter, and
	 * that it still does so when the same parser is reused on a second input in which one row
	 * ends with {@code 2} instead of {@code 2.2}.
	 *
	 * @throws Exception declared by the original signature; nothing in the body throws a checked exception.
	 */
	@Test
	public void testAutodetection() throws Exception {
		CsvParserSettings settings = new CsvParserSettings();
		settings.detectFormatAutomatically();
		CsvParser parser = new CsvParser(settings);

		String s = "" +
				"1;2001-01-01;First row;1.1\n" +
				"2;2002-02-02;Second row;2.2\n" +
				"3;2003-03-03;Third row;3.3\n" +
				"4;2004-04-04;Fourth row;4.4";

		List<String[]> rows = parser.parseAll(new StringReader(s));

		CsvFormat format = parser.getDetectedFormat();
		assertEquals(format.getDelimiter(), ';');
		assertEquals(rows.size(), 4);

		// Second input: the last value of the second row has no '.', so '.' no longer appears equally often on every row.
		s = "" +
				"1;2001-01-01;First row;1.1\n" +
				"2;2002-02-02;Second row;2\n" +
				"3;2003-03-03;Third row;3.3\n" +
				"4;2004-04-04;Fourth row;4.4";

		rows = parser.parseAll(new StringReader(s));

		format = parser.getDetectedFormat();
		assertEquals(format.getDelimiter(), ';');
		assertEquals(rows.size(), 4);
	}

	/**
	 * Verifies that {@code ','} is detected when detection is restricted to the candidates
	 * {@code ','} and {@code ' '} and the input has a space after every comma.
	 */
	@Test
	public static void testDelimitersDetectedUsingOrderOfPreference() {
		String input = "HEADER1, HEADER2, HEADER3\n" +
				"11, 12, 13\n" +
				"21, 22, 23\n" +
				"31, 32, 33\n";

		CsvParserSettings settings = new CsvParserSettings();
		settings.setDelimiterDetectionEnabled(true, ',', ' ');
		CsvParser parser = new CsvParser(settings);
		parser.parseAll(new StringReader(input));
		CsvFormat format = parser.getDetectedFormat();
		assertEquals(format.getDelimiter(), ',');
	}

	/**
	 * Verifies that {@code ','} is detected when detection is restricted to the candidates
	 * {@code ','} and {@code ' '}, the format detector row sample count is set to 2, the values
	 * contain spaces, and the last row ends with a trailing comma.
	 */
	@Test
	public static void testDelimitersDetectedUsingOrderOfPreference1() {
		String input = "HEADER 1,HEADER 2,HEADER 3\n" +
						"SOME TEXT 1,SOME TEXT 2,SOME TEXT 3,";

		CsvParserSettings settings = new CsvParserSettings();
		settings.setDelimiterDetectionEnabled(true, ',', ' ');
		settings.setFormatDetectorRowSampleCount(2);
		CsvParser parser = new CsvParser(settings);
		parser.parseAll(new StringReader(input));
		CsvFormat format = parser.getDetectedFormat();
		assertEquals(format.getDelimiter(), ',');
	}
}