// BomUtil.java
package de.siegmar.fastcsv.reader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Optional;
/**
 * Package-private helpers for detecting a Byte Order Mark (BOM) at the start of
 * a byte sequence or file, and for opening a {@link Reader} that skips it.
 */
@SuppressWarnings("checkstyle:MagicNumber")
final class BomUtil {

    /**
     * The maximum number of bytes a BOM header can have.
     */
    static final int POTENTIAL_BOM_SIZE = 4;

    private BomUtil() {
    }

    /**
     * Detects the character encoding of a byte array based on the presence of a Byte Order Mark (BOM) header.
     * Only the leading bytes of the array are examined. The method supports the following BOM headers:
     * <ul>
     *     <li>UTF-8    : EF BB BF</li>
     *     <li>UTF-16 BE: FE FF</li>
     *     <li>UTF-16 LE: FF FE</li>
     *     <li>UTF-32 BE: 00 00 FE FF</li>
     *     <li>UTF-32 LE: FF FE 00 00</li>
     * </ul>
     * <p>
     * See <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Byte order mark</a>
     *
     * @param buf the byte array to detect the character encoding from
     * @return an Optional containing the detected BomHeader if a BOM header is found,
     *     or an empty Optional if no BOM header is found (including when the array
     *     holds fewer than two bytes)
     */
    @SuppressWarnings({
        "checkstyle:CyclomaticComplexity",
        "checkstyle:BooleanExpressionComplexity",
        "checkstyle:NestedIfDepth",
        "checkstyle:ReturnCount",
        "PMD.AvoidLiteralsInIfCondition"
    })
    static Optional<BomHeader> detectCharset(final byte[] buf) {
        final int n = buf.length;

        if (n < 2) {
            // Not enough bytes to be a BOM header
            return Optional.empty();
        }

        if (buf[0] == (byte) 0xEF) {
            if (n > 2 && buf[1] == (byte) 0xBB && buf[2] == (byte) 0xBF) {
                return Optional.of(BomHeader.UTF_8);
            }
        } else if (buf[0] == (byte) 0xFE) {
            if (buf[1] == (byte) 0xFF) {
                return Optional.of(BomHeader.UTF_16_BE);
            }
        } else if (buf[0] == (byte) 0xFF) {
            if (buf[1] == (byte) 0xFE) {
                // UTF-32 LE starts with the UTF-16 LE mark (FF FE), so the longer
                // pattern has to be tested first.
                if (n > 3 && buf[2] == (byte) 0x00 && buf[3] == (byte) 0x00) {
                    return Optional.of(BomHeader.UTF_32_LE);
                } else {
                    return Optional.of(BomHeader.UTF_16_LE);
                }
            }
        } else if (buf[0] == (byte) 0x00) {
            if (n > 3
                && buf[1] == (byte) 0x00
                && buf[2] == (byte) 0xFE
                && buf[3] == (byte) 0xFF) {
                return Optional.of(BomHeader.UTF_32_BE);
            }
        }

        return Optional.empty();
    }

    /**
     * Detects the character encoding of a file based on the presence of a Byte Order Mark (BOM) header.
     * At most {@link #POTENTIAL_BOM_SIZE} bytes are read from the beginning of the file;
     * the file is closed again before this method returns.
     *
     * @param file the file to detect the character encoding from
     * @return an Optional containing the detected BomHeader if a BOM header is found,
     *     or an empty Optional if no BOM header is found
     * @throws IOException if an I/O error occurs reading the file
     */
    static Optional<BomHeader> detectCharset(final Path file)
        throws IOException {
        try (var in = Files.newInputStream(file, StandardOpenOption.READ)) {
            return detectCharset(in.readNBytes(BomUtil.POTENTIAL_BOM_SIZE));
        }
    }

    /**
     * Opens a Reader for the given file, skipping a BOM header if present.
     * If no BOM header is present, the defaultCharset is used.
     * <p>
     * The file is first probed via {@link #detectCharset(Path)} and then opened a second
     * time for the returned Reader. The caller owns the returned Reader and must close it.
     * If this method fails after the file has been opened, the underlying stream is
     * closed before the exception is propagated.
     *
     * @param file           the file to open a Reader for
     * @param defaultCharset the default charset to use if no BOM header is present
     * @return a Reader for the given file
     * @throws IOException if an I/O error occurs opening the file or the BOM header
     *     cannot be skipped
     */
    static Reader openReader(final Path file, final Charset defaultCharset) throws IOException {
        final var bomHeader = detectCharset(file);
        final var in = Files.newInputStream(file);

        try {
            // No BOM header found
            if (bomHeader.isEmpty()) {
                return new InputStreamReader(in, defaultCharset);
            }

            // Return reader with skipped BOM header.
            // readNBytes blocks until the requested bytes are consumed or EOF is hit,
            // whereas skip() is allowed to skip fewer bytes for no particular reason.
            final var header = bomHeader.get();
            final int bomLength = header.getLength();
            if (in.readNBytes(bomLength).length != bomLength) {
                throw new IOException("Failed to skip BOM header");
            }

            return new InputStreamReader(in, header.getCharset());
        } catch (final IOException | RuntimeException e) {
            // Ownership of the stream was never handed to the caller: release it here.
            try {
                in.close();
            } catch (final IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

}