BomUtil.java
package de.siegmar.fastcsv.reader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Optional;
@SuppressWarnings("checkstyle:MagicNumber")
final class BomUtil {
/// The maximum number of bytes a BOM header can have.
static final int POTENTIAL_BOM_SIZE = 4;
private BomUtil() {
}
/// Detects the character encoding of a byte array based on the presence of a Byte Order Mark (BOM) header.
/// The method supports the following BOM headers:
///
/// | Encoding | BOM |
/// |-----------|-------------|
/// | UTF-8 | EF BB BF |
/// | UTF-16 BE | FE FF |
/// | UTF-16 LE | FF FE |
/// | UTF-32 BE | 00 00 FE FF |
/// | UTF-32 LE | FF FE 00 00 |
///
/// See <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Byte order mark</a>.
///
/// @param buf the byte array to detect the character encoding from
/// @return an Optional containing the detected BomHeader if a BOM header is found,
/// or an empty Optional if no BOM header is found
@SuppressWarnings({
"checkstyle:CyclomaticComplexity",
"checkstyle:BooleanExpressionComplexity",
"checkstyle:NestedIfDepth",
"checkstyle:ReturnCount",
"PMD.AvoidLiteralsInIfCondition"
})
static Optional<BomHeader> detectCharset(final byte[] buf) {
final int n = buf.length;
if (n < 2) {
// Not enough bytes to be a BOM header
return Optional.empty();
}
if (buf[0] == (byte) 0xEF) {
if (n > 2 && buf[1] == (byte) 0xBB && buf[2] == (byte) 0xBF) {
return Optional.of(BomHeader.UTF_8);
}
} else if (buf[0] == (byte) 0xFE) {
if (buf[1] == (byte) 0xFF) {
return Optional.of(BomHeader.UTF_16_BE);
}
} else if (buf[0] == (byte) 0xFF) {
if (buf[1] == (byte) 0xFE) {
if (n > 3 && buf[2] == (byte) 0x00 && buf[3] == (byte) 0x00) {
return Optional.of(BomHeader.UTF_32_LE);
} else {
return Optional.of(BomHeader.UTF_16_LE);
}
}
} else if (buf[0] == (byte) 0x00) {
if (n > 3
&& buf[1] == (byte) 0x00
&& buf[2] == (byte) 0xFE
&& buf[3] == (byte) 0xFF) {
return Optional.of(BomHeader.UTF_32_BE);
}
}
return Optional.empty();
}
/// Detects the character encoding of a file based on the presence of a Byte Order Mark (BOM) header.
///
/// @param file the file to detect the character encoding from
/// @return an [Optional] containing the detected [BomHeader] if a BOM header is found,
/// or [Optional#EMPTY] if no BOM header is found
/// @throws IOException if an I/O error occurs reading the file
static Optional<BomHeader> detectCharset(final Path file)
throws IOException {
try (var in = Files.newInputStream(file, StandardOpenOption.READ)) {
return detectCharset(in.readNBytes(POTENTIAL_BOM_SIZE));
}
}
}