Github_154.java

/*******************************************************************************
 * Copyright 2017 Univocity Software Pty Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.univocity.parsers.issues.github;

import com.univocity.parsers.annotations.*;
import com.univocity.parsers.common.processor.*;
import com.univocity.parsers.csv.*;
import org.testng.annotations.*;

import java.io.*;
import java.nio.charset.*;
import java.util.*;

import static com.univocity.parsers.common.ArgumentUtils.*;
import static com.univocity.parsers.common.input.BomInput.*;
import static org.testng.Assert.*;

/**
 * From: https://github.com/univocity/univocity-parsers/issues/154
 *
 * Parses a two-line CSV input encoded with several charsets, optionally prefixed with a
 * byte order mark (BOM), passing the encoding to the parser either explicitly or as {@code null}.
 *
 * @author Univocity Software Pty Ltd - <a href="mailto:dev@univocity.com">dev@univocity.com</a>
 */
public class Github_154 {

	/** Bean populated from the {@code Email} column of the input. */
	public static class User {
		@Parsed(field = "Email")
		private String email;
	}

	/** One header row followed by one data row. */
	private static final String INPUT = "Email\ndev@univocity.com";

	/**
	 * Header expected for the edge-case row of {@link #getFileAndEncoding()}.
	 *
	 * The two bytes following {@code 0xFF 0xFE} in that row are {@code 0x00 0x20}, which is the UTF-16LE
	 * encoding of U+2000. The character is written as a unicode escape so that the expectation does not
	 * depend on the encoding used to store or compile this source file.
	 */
	private static final String EDGE_CASE_HEADER = "\u2000Email";

	/**
	 * Encodes {@link #INPUT} using the given charset.
	 *
	 * @param encoding name of the charset used to encode the input
	 *
	 * @return the encoded bytes of {@link #INPUT}
	 */
	private static byte[] getInput(String encoding) {
		return INPUT.getBytes(Charset.forName(encoding));
	}

	/**
	 * Returns a new array holding {@code prefix} followed by {@code content}.
	 *
	 * @param prefix  bytes to place first
	 * @param content bytes to place after the prefix
	 *
	 * @return the concatenation of both arrays
	 */
	private static byte[] prepend(byte[] prefix, byte[] content) {
		byte[] out = new byte[prefix.length + content.length];
		System.arraycopy(prefix, 0, out, 0, prefix.length);
		System.arraycopy(content, 0, out, prefix.length, content.length);
		return out;
	}

	/**
	 * Test data for {@link #readWithBom(boolean, String, byte[])}.
	 *
	 * Each row holds: whether {@code null} is given to the parser instead of the encoding name,
	 * the encoding used to produce the input bytes, and the bytes to prepend to the input
	 * ({@code null} for none).
	 */
	@DataProvider
	Object[][] getFileAndEncoding() {
		return new Object[][]{
				{true, "UTF-8", null},
				{false, "UTF-8", null},
				{true, "UTF-8", UTF_8_BOM},
				{false, "UTF-8", UTF_8_BOM},

				{true, "UTF-16BE", UTF_16BE_BOM},
				{false, "UTF-16BE", UTF_16BE_BOM},

				{true, "UTF-16LE", UTF_16LE_BOM},
				{false, "UTF-16LE", UTF_16LE_BOM},

				//edge case here. Looks like UTF-32LE until the last character.
				{true, "UTF-16LE", toByteArray(0xFF, 0xFE, 0x00, ' ')},

				{true, "UTF-32BE", UTF_32BE_BOM},
				{false, "UTF-32BE", UTF_32BE_BOM},

				{true, "UTF-32LE", UTF_32LE_BOM},
				{false, "UTF-32LE", UTF_32LE_BOM},
		};
	}

	/**
	 * Parses the encoded input and verifies the header, the first row and (when the header is intact)
	 * the bean produced from it.
	 *
	 * @param extractFromBom if {@code true}, {@code null} is passed to the parser as the encoding;
	 *                       otherwise the encoding name is passed explicitly
	 * @param encoding       encoding used to generate the input bytes
	 * @param prepend        bytes placed before the encoded input, or {@code null} for none
	 */
	@Test(dataProvider = "getFileAndEncoding")
	public void readWithBom(boolean extractFromBom, String encoding, byte[] prepend) {
		final CsvParserSettings parserSettings = new CsvParserSettings();
		final BeanListProcessor<User> rowProcessor = new BeanListProcessor<User>(User.class);

		parserSettings.setProcessor(rowProcessor);
		parserSettings.setLineSeparatorDetectionEnabled(true);
		parserSettings.setHeaderExtractionEnabled(true);
		parserSettings.setSkipEmptyLines(false);
		parserSettings.setReadInputOnSeparateThread(false);

		final CsvParser parser = new CsvParser(parserSettings);

		byte[] bytes = getInput(encoding);
		if (prepend != null) {
			bytes = prepend(prepend, bytes);
		}

		// The input bytes are always produced with the real encoding; only what the parser is told varies.
		final String parserEncoding = extractFromBom ? null : encoding;

		parser.beginParsing(new ByteArrayInputStream(bytes), parserEncoding);
		final String[] row = parser.parseNext();
		parser.stopParsing();

		// Only the edge-case row of the data provider ends its prefix with a space character.
		final boolean edgeCase = prepend != null && prepend[prepend.length - 1] == ' ';

		assertEquals(parser.getContext().headers()[0], edgeCase ? EDGE_CASE_HEADER : "Email");
		assertEquals(row[0], "dev@univocity.com");

		if (!edgeCase) {
			// The bean is only checked when the header is exactly "Email", the name the User field is mapped to.
			final List<User> actual = rowProcessor.getBeans();
			assertEquals(actual.get(0).email, "dev@univocity.com");
		}
	}
}