EncodingResult.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect;
import java.nio.charset.Charset;
/**
* A charset detection result pairing a {@link Charset} with a confidence score
* and a {@link ResultType} indicating the nature of the evidence.
*
* <h3>Result types</h3>
* <ul>
* <li>{@link ResultType#DECLARATIVE} ��� the document explicitly stated its
* encoding (BOM, HTML {@code <meta charset>}). These are authoritative
* claims about author intent and get preference over inferred results
* <em>when consistent with the actual bytes</em>.</li>
* <li>{@link ResultType#STRUCTURAL} ��� byte-grammar proof (ISO-2022 escape
* sequences, UTF-8 multibyte validation). The encoding is proven by the
* byte structure itself, independent of any declaration.</li>
* <li>{@link ResultType#STATISTICAL} ��� probabilistic inference from a
* statistical model. The {@code confidence} float is meaningful here
* for ranking among candidates; for DECLARATIVE and STRUCTURAL results
* it is conventionally {@code 1.0} but carries no additional information.</li>
* </ul>
*
* @since Apache Tika 4.0
*/
public class EncodingResult {
/**
* The nature of the evidence that produced this result.
*/
public enum ResultType {
/**
* The document explicitly declared its encoding (BOM, HTML meta charset).
* Authoritative about author intent; preferred over inferred results when
* consistent with the actual bytes.
*/
DECLARATIVE,
/**
* The encoding is proven by byte-grammar structure (ISO-2022 escape
* sequences, UTF-8 multibyte validation). Not a guess ��� the byte
* patterns are only valid in this encoding.
*/
STRUCTURAL,
/**
* Probabilistic inference from a statistical model. The confidence
* float is meaningful for ranking among candidates.
*/
STATISTICAL
}
private final Charset charset;
private final float confidence;
/**
* The detector's original label for this result. Usually identical to
* {@code charset.name()}, but may differ when the detector uses training
* labels that are finer-grained than the Java charset registry (e.g.
* {@code "IBM420-ltr"} / {@code "IBM420-rtl"} both map to Java's
* {@code "IBM420"}, and {@code "windows-874"} maps to Java's canonical
* {@code "x-windows-874"}). Preserved so that evaluation tooling and
* callers that care about sub-charset properties can access the original
* prediction without going through {@code Charset.name()}.
*/
private final String label;
private final ResultType resultType;
/**
* Constructs a STATISTICAL result. Existing detectors that do not yet
* classify their evidence type default to statistical (probabilistic)
* treatment, which is the safe, arbitratable assumption.
*
* @param charset the detected charset; must not be {@code null}
* @param confidence detection confidence in {@code [0.0, 1.0]}
*/
public EncodingResult(Charset charset, float confidence) {
this(charset, confidence, charset.name(), ResultType.STATISTICAL);
}
/**
* Constructs a STATISTICAL result with a detector-specific label.
*
* @param charset the detected charset; must not be {@code null}
* @param confidence detection confidence in {@code [0.0, 1.0]}
* @param label the detector's original label (e.g. {@code "IBM420-ltr"});
* if {@code null}, defaults to {@code charset.name()}
*/
public EncodingResult(Charset charset, float confidence, String label) {
this(charset, confidence, label, ResultType.STATISTICAL);
}
/**
* Constructs a result with an explicit {@link ResultType}.
*
* @param charset the detected charset; must not be {@code null}
* @param confidence detection confidence in {@code [0.0, 1.0]}
* @param label the detector's original label; if {@code null},
* defaults to {@code charset.name()}
* @param resultType the nature of the evidence; must not be {@code null}
*/
public EncodingResult(Charset charset, float confidence, String label,
ResultType resultType) {
if (charset == null) {
throw new IllegalArgumentException("charset must not be null");
}
if (resultType == null) {
throw new IllegalArgumentException("resultType must not be null");
}
this.charset = charset;
this.confidence = Math.max(0f, Math.min(1f, confidence));
this.label = (label != null) ? label : charset.name();
this.resultType = resultType;
}
public Charset getCharset() {
return charset;
}
/**
* Detection confidence in {@code [0.0, 1.0]}. Meaningful for ranking
* among {@link ResultType#STATISTICAL} candidates. For
* {@link ResultType#DECLARATIVE} and {@link ResultType#STRUCTURAL} results
* the value is conventionally {@code 1.0} but carries no additional
* information beyond the type itself.
*/
public float getConfidence() {
return confidence;
}
/**
* The nature of the evidence that produced this result.
*
* @see ResultType
*/
public ResultType getResultType() {
return resultType;
}
/**
* The detector's original label for this result. Usually identical to
* {@link #getCharset()}{@code .name()}, but preserved when the detector
* uses finer-grained labels than the Java charset registry supports (e.g.
* {@code "IBM420-ltr"}, {@code "IBM420-rtl"}, {@code "windows-874"}).
*/
public String getLabel() {
return label;
}
@Override
public String toString() {
String cs = charset.name();
String lbl = label.equals(cs) ? cs : label + "(" + cs + ")";
return lbl + "@" + String.format(java.util.Locale.ROOT, "%.2f", confidence)
+ "[" + resultType + "]";
}
}