// BOMDetector.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Collections;
import java.util.List;
import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.IOUtils;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
/**
* Encoding detector that identifies the character set from a byte-order mark
* (BOM) at the start of the stream. Returns a single {@link EncodingResult.ResultType#DECLARATIVE}
* result when a BOM is found — a BOM is an explicit in-band declaration of encoding
* and takes priority over all statistical or structural inference.
*
* <p>SPI-loaded first in the default encoding-detector chain so that BOM evidence
* reaches {@code CharSoupEncodingDetector} before any statistical detector runs.
* {@code MojibusterEncodingDetector} strips the BOM from its own probe independently
* to ensure consistent model inference (BOMs are excluded from training data).</p>
*
* @since Apache Tika 0.x (moved to org.apache.tika.detect in 4.0)
*/
@TikaComponent
public class BOMDetector implements EncodingDetector {

    /**
     * BOMs to probe, in match order. Order matters: the UTF-32LE BOM
     * (FF FE 00 00) begins with the UTF-16LE BOM (FF FE), so the 32-bit
     * marks have to be tried before the 16-bit ones.
     */
    private static final ByteOrderMark[] BOMS = {
            ByteOrderMark.UTF_8,
            ByteOrderMark.UTF_32BE,
            ByteOrderMark.UTF_32LE,
            ByteOrderMark.UTF_16BE,
            ByteOrderMark.UTF_16LE
    };

    /**
     * Charset for each entry of {@link #BOMS} (same index). An entry is
     * {@code null} when the running JVM does not provide that charset.
     */
    private static final Charset[] CHARSETS = new Charset[BOMS.length];

    /** Fewest bytes that must be read before any BOM comparison is attempted. */
    private static final int MIN_BYTES = 2;

    /** Number of leading bytes inspected; also the mark/reset read limit. */
    private static final int MAX_BYTES = 4;

    static {
        for (int i = 0; i < BOMS.length; i++) {
            try {
                CHARSETS[i] = Charset.forName(BOMS[i].getCharsetName());
            } catch (UnsupportedCharsetException ignored) {
                // Deliberately best-effort: UTF-32 is not among the charsets every
                // Java platform is required to support. The slot stays null and
                // detect() declines to answer for that BOM instead of failing
                // class initialization.
            }
        }
    }

    /**
     * Looks for a byte-order mark in the first {@value #MAX_BYTES} bytes of the stream.
     * The stream is marked before reading and reset afterwards, whether or not the
     * read succeeds.
     *
     * @param tis          stream to inspect; marked, read and then reset
     * @param metadata     not used by this detector
     * @param parseContext not used by this detector
     * @return a single {@link EncodingResult.ResultType#DECLARATIVE} result with
     *         confidence 1.0 when a BOM with an available charset is found;
     *         otherwise an empty list
     * @throws IOException if reading from or resetting the stream fails
     */
    @Override
    public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata,
                                       ParseContext parseContext) throws IOException {
        byte[] prefix = new byte[MAX_BYTES];
        int numRead;
        tis.mark(MAX_BYTES);
        try {
            // May come back with fewer than MAX_BYTES on a short stream.
            numRead = IOUtils.read(tis, prefix);
        } finally {
            tis.reset();
        }
        if (numRead < MIN_BYTES) {
            return Collections.emptyList();
        }

        for (int i = 0; i < BOMS.length; i++) {
            if (!startsWith(BOMS[i], prefix, numRead)) {
                continue;
            }
            Charset charset = CHARSETS[i];
            if (charset == null) {
                // The first matching BOM decides the outcome. Continuing the scan
                // here would let a UTF-32LE BOM (FF FE 00 00) be reported as
                // UTF-16LE (FF FE) whenever the UTF-32LE charset is unavailable,
                // so decline to answer instead.
                return Collections.emptyList();
            }
            return List.of(new EncodingResult(charset, 1.0f,
                    charset.name(), EncodingResult.ResultType.DECLARATIVE));
        }
        return Collections.emptyList();
    }

    /**
     * Tests whether the valid part of {@code bytes} begins with the given BOM.
     *
     * @param bom    byte-order mark to compare against
     * @param bytes  buffer holding the leading bytes of the stream
     * @param length number of valid bytes in {@code bytes}
     * @return {@code true} if at least as many bytes as the BOM are valid and
     *         each of them equals the corresponding BOM byte
     */
    private static boolean startsWith(ByteOrderMark bom, byte[] bytes, int length) {
        byte[] bomBytes = bom.getBytes();
        if (length < bomBytes.length) {
            return false;
        }
        for (int i = 0; i < bomBytes.length; i++) {
            if (bomBytes[i] != bytes[i]) {
                return false;
            }
        }
        return true;
    }
}