EncodingDetectorContext.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
/**
* Context object that collects encoding detection results from base detectors.
* Stored in {@link org.apache.tika.parser.ParseContext} by
* {@link CompositeEncodingDetector} so that a {@link MetaEncodingDetector}
* can see all candidates and arbitrate. Removed after detection to prevent
* contamination during recursive parsing.
*
* <p>Each base detector contributes a ranked {@link List} of
* {@link EncodingResult}s. The context exposes the top result from each
* detector as the primary signal, and provides access to all candidates
* for richer arbitration strategies.</p>
*
* @since Apache Tika 3.2
*/
public class EncodingDetectorContext {
private final List<Result> results = new ArrayList<>();
private String arbitrationInfo;
/**
* Record the ranked results from a child detector.
*
* @param encodingResults ranked results, highest confidence first; must not be empty
* @param detectorName simple class name of the detector
*/
public void addResult(List<EncodingResult> encodingResults, String detectorName) {
if (encodingResults != null && !encodingResults.isEmpty()) {
results.add(new Result(encodingResults, detectorName));
}
}
/**
* @return unmodifiable list of all per-detector results in detection order
*/
public List<Result> getResults() {
return Collections.unmodifiableList(results);
}
/**
* Returns the unique charsets from ALL results of every detector,
* in detection order (top result first within each detector).
*
* <p>Using all candidates rather than just each detector's top-1 is
* important when a single detector returns a ranked list (e.g., Mojibuster
* on a short probe returns [windows-1252, windows-1250, Shift-JIS]). If
* only the top-1 were used, CharSoup would see a single charset and
* return "unanimous" without ever attempting arbitration.</p>
*/
public Set<Charset> getUniqueCharsets() {
Set<Charset> charsets = new LinkedHashSet<>();
for (Result r : results) {
for (EncodingResult er : r.getEncodingResults()) {
charsets.add(er.getCharset());
}
}
return charsets;
}
/**
* Returns the highest confidence seen for the given charset across all
* detector results (not just top results). Useful for arbitrators that
* want to propagate the base detector's confidence for the winning charset.
*/
public float getTopConfidenceFor(Charset charset) {
float best = 0f;
for (Result r : results) {
for (EncodingResult er : r.getEncodingResults()) {
if (er.getCharset().equals(charset) && er.getConfidence() > best) {
best = er.getConfidence();
}
}
}
return best;
}
/**
* Set by the meta detector to describe how it reached its decision.
* Values: "unanimous", "scored", "no-stream", "empty-stream", etc.
*/
public void setArbitrationInfo(String info) {
this.arbitrationInfo = info;
}
public String getArbitrationInfo() {
return arbitrationInfo;
}
/**
* A single detector's contribution: its ranked list of candidates and its name.
*/
public static class Result {
private final List<EncodingResult> encodingResults;
private final String detectorName;
public Result(List<EncodingResult> encodingResults, String detectorName) {
this.encodingResults = Collections.unmodifiableList(
new ArrayList<>(encodingResults));
this.detectorName = detectorName;
}
/**
* All ranked results from this detector, highest confidence first.
*/
public List<EncodingResult> getEncodingResults() {
return encodingResults;
}
/**
* The top-ranked charset from this detector.
*/
public Charset getCharset() {
return encodingResults.get(0).getCharset();
}
/**
* The confidence of the top-ranked result from this detector.
*/
public float getConfidence() {
return encodingResults.get(0).getConfidence();
}
/**
* The {@link EncodingResult.ResultType} of the top-ranked result from this detector.
*/
public EncodingResult.ResultType getResultType() {
return encodingResults.get(0).getResultType();
}
public String getDetectorName() {
return detectorName;
}
@Override
public String toString() {
return detectorName + "=" + encodingResults.get(0);
}
}
}