// StandardHtmlEncodingDetector.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.html.charsetdetector;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.List;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.detect.MetadataCharsetDetector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
/**
* An encoding detector that respects the HTML5 encoding-sniff algorithm
* (https://html.spec.whatwg.org/multipage/parsing.html#the-input-byte-stream):
* BOM &rarr; HTTP Content-Type header &rarr; {@code <meta charset>} / {@code <meta http-equiv>} tag.
*
* <p>When used standalone (outside a {@link org.apache.tika.detect.CompositeEncodingDetector}
* chain) this detector handles the full spec algorithm including BOM detection.
*
* <p>When used inside the default Tika chain (with {@code BOMDetector} and
* {@code MetadataCharsetDetector} already present), set {@code skipBOM=true} so that
* this detector focuses exclusively on the HTML {@code <meta>} scan. That lets
* {@code CharSoupEncodingDetector} arbitrate between a BOM declaration and a
* contradicting {@code <meta>} declaration instead of silently suppressing one.
*
* <p>HTTP/MIME Content-Type and Content-Encoding metadata are always read here for
* standalone compatibility; in the chain they will already have been returned by
* {@code MetadataCharsetDetector} and {@code CharSoup} will handle the duplication
* gracefully (identical DECLARATIVE results agree, so no harm done).
*/
@TikaComponent(name = "standard-html-encoding-detector")
public final class StandardHtmlEncodingDetector implements EncodingDetector {

    /**
     * Default number of bytes to scan for a {@code <meta charset>} declaration.
     * 65536 is large enough to cover typical {@code <script>} or {@code <style>}
     * blocks in the {@code <head>} without significant overhead (encoding detection
     * already buffers the stream). Users who need to handle even deeper declarations
     * can raise this via {@link #setMarkLimit(int)}.
     */
    private static final int META_TAG_BUFFER_SIZE = 65536;

    /** Maximum number of bytes read from the stream during a single detection. */
    private int markLimit = META_TAG_BUFFER_SIZE;

    /**
     * When {@code true}, the BOM check is skipped and the detector goes directly to
     * the Content-Type header and {@code <meta>} scan. Use this when
     * {@code BOMDetector} is already present in the chain so that
     * {@code CharSoupEncodingDetector} can arbitrate between a BOM declaration and a
     * contradicting {@code <meta charset>} rather than having the BOM silently win.
     *
     * <p>Default: {@code false} (HTML5 spec-compliant standalone behaviour).</p>
     */
    private boolean skipBOM = false;

    /**
     * Detects the declared encoding of an HTML stream.
     *
     * <p>The sources are consulted in this order and the first one that yields a
     * charset wins: byte order mark (unless {@link #isSkipBOM()} is {@code true}),
     * the Content-Type metadata, the Content-Encoding metadata, and finally a
     * pre-scan of the first {@link #getMarkLimit()} bytes of the stream.
     *
     * <p>The stream is marked before detection and reset afterwards, including
     * when detection fails with an exception, so the caller always gets the
     * stream back at the position it had on entry.
     *
     * @param tis      the stream to inspect
     * @param metadata metadata that may carry Content-Type / Content-Encoding hints
     * @param context  the parse context (not used by this detector)
     * @return a single {@code DECLARATIVE} result with confidence {@code 1.0}, or an
     *         empty list when no source declares a charset
     * @throws IOException if reading from or resetting the stream fails
     */
    @Override
    public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata,
                                       ParseContext context) throws IOException {
        int limit = getMarkLimit();
        tis.mark(limit);
        Charset detectedCharset;
        try {
            detectedCharset = sniffCharset(tis, metadata, limit);
        } finally {
            // Rewind even when sniffing throws; otherwise the caller's stream would be
            // left part-way through the document for whoever reads it next.
            tis.reset();
        }
        if (detectedCharset == null) {
            return Collections.emptyList();
        }
        return List.of(new EncodingResult(detectedCharset, 1.0f,
                detectedCharset.name(), EncodingResult.ResultType.DECLARATIVE));
    }

    /**
     * Runs the sniffing steps in priority order and returns the first charset found.
     *
     * @param tis      the already-marked stream to read from
     * @param metadata metadata that may carry Content-Type / Content-Encoding hints
     * @param limit    maximum number of bytes that may be read from {@code tis}
     * @return the detected charset, or {@code null} if no step produced one
     * @throws IOException if reading from the stream fails
     */
    private Charset sniffCharset(TikaInputStream tis, Metadata metadata, int limit)
            throws IOException {
        // Cap reads at the mark limit so the reset() in detect() stays valid.
        // The wrapper is intentionally left unclosed: the underlying stream belongs
        // to the caller and must remain open after detection.
        InputStream limitedStream = BoundedInputStream.builder()
                .setInputStream(tis).setMaxCount(limit).get();
        PreScanner preScanner = new PreScanner(limitedStream);
        Charset detectedCharset = null;
        if (!skipBOM) {
            // HTML5 spec: BOM overrides everything. When used standalone this
            // detector is responsible for BOM detection; when used in the chain with
            // BOMDetector, setting skipBOM=true lets CharSoup arbitrate.
            detectedCharset = preScanner.detectBOM();
        }
        if (detectedCharset == null) {
            detectedCharset = MetadataCharsetDetector.charsetFromContentType(metadata);
        }
        if (detectedCharset == null) {
            detectedCharset = MetadataCharsetDetector.charsetFromContentEncoding(metadata);
        }
        if (detectedCharset == null) {
            // Last resort: look for a <meta> declaration in the document itself.
            detectedCharset = preScanner.scan();
        }
        return detectedCharset;
    }

    /**
     * Returns how many bytes of the stream may be read during detection.
     *
     * @return the current mark limit in bytes
     */
    public int getMarkLimit() {
        return markLimit;
    }

    /**
     * How far into the stream to scan for a {@code <meta charset>} declaration.
     * Default is {@value #META_TAG_BUFFER_SIZE} bytes.
     *
     * @param markLimit the number of bytes to mark and scan
     */
    public void setMarkLimit(int markLimit) {
        this.markLimit = markLimit;
    }

    /**
     * Returns whether the BOM check is skipped.
     *
     * @return {@code true} if BOM detection is skipped
     */
    public boolean isSkipBOM() {
        return skipBOM;
    }

    /**
     * When {@code true}, skip the BOM check and rely on {@code BOMDetector} in the
     * chain. This allows {@code CharSoupEncodingDetector} to arbitrate between a
     * BOM and a contradicting {@code <meta charset>} declaration.
     * Default is {@code false}.
     *
     * @param skipBOM {@code true} to skip BOM detection
     */
    public void setSkipBOM(boolean skipBOM) {
        this.skipBOM = skipBOM;
    }
}