// DefaultDetector.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.ServiceLoaderUtils;
/**
 * A composite detector that orchestrates the detection pipeline:
 * <ol>
 * <li>MimeTypes (magic byte) detection</li>
 * <li>Container and other detectors loaded via SPI</li>
 * <li>TextDetector as fallback for unknown types</li>
 * <li>Returns the most specific type detected</li>
 * </ol>
 * <p>
 * Detectors are loaded and returned in a specified order: user supplied
 * detectors first, followed by the non-MimeTypes Tika detectors.
 * If you need to control the order of the Detectors, you should instead
 * construct your own {@link CompositeDetector} and pass in the list
 * of Detectors in the required order.
 * <p>
 * Individual detectors that need random access (e.g., for container inspection)
 * handle their own spooling by calling {@link TikaInputStream#getFile()}.
 *
 * @since Apache Tika 0.9
 */
@TikaComponent(spi = false)
public class DefaultDetector extends CompositeDetector {

    private static final long serialVersionUID = -8170114575326908027L;

    // Transient: not written by Java serialization, so it may be null on a
    // deserialized instance; getDetectors() guards against that.
    private final transient ServiceLoader loader;

    // Immutable snapshot of the exclusions passed to the constructor; never null.
    private final Collection<Class<? extends Detector>> excludedClasses;

    // Used for step 1 (magic detection) and for the media type registry
    // consulted when ranking candidate types.
    private final MimeTypes mimeTypes;

    // Fallback detector, consulted only when every other step yields octet-stream.
    private final TextDetector textDetector;

    /**
     * Creates a detector from the given MIME type database, the statically
     * loadable detectors found by the given service loader, and a set of
     * detector classes to leave out.
     *
     * @param types            MIME type database used for magic detection; must not be null
     * @param loader           service loader used to discover detectors; must not be null
     * @param excludeDetectors detector classes to exclude from loading; a null value
     *                         is treated as "exclude nothing"
     */
    public DefaultDetector(MimeTypes types, ServiceLoader loader,
                           Collection<Class<? extends Detector>> excludeDetectors) {
        super(types.getMediaTypeRegistry(), getDefaultDetectors(loader, excludeDetectors));
        this.loader = loader;
        this.mimeTypes = types;
        this.textDetector = new TextDetector();
        // Defensive copy so later changes to the caller's collection are not visible here.
        this.excludedClasses = excludeDetectors != null ?
                Collections.unmodifiableCollection(new ArrayList<>(excludeDetectors)) :
                Collections.emptySet();
    }

    /**
     * Creates a detector with no excluded detector classes.
     *
     * @param types  MIME type database used for magic detection; must not be null
     * @param loader service loader used to discover detectors; must not be null
     */
    public DefaultDetector(MimeTypes types, ServiceLoader loader) {
        this(types, loader, Collections.emptySet());
    }

    /**
     * Creates a detector whose service loader is built from the given class loader.
     *
     * @param types  MIME type database used for magic detection; must not be null
     * @param loader class loader handed to a new {@link ServiceLoader}
     */
    public DefaultDetector(MimeTypes types, ClassLoader loader) {
        this(types, new ServiceLoader(loader));
    }

    /**
     * Creates a detector using {@link MimeTypes#getDefaultMimeTypes()} and a
     * service loader built from the given class loader.
     *
     * @param loader class loader handed to a new {@link ServiceLoader}
     */
    public DefaultDetector(ClassLoader loader) {
        this(MimeTypes.getDefaultMimeTypes(), loader);
    }

    /**
     * Creates a detector using the given MIME type database and a
     * default-constructed {@link ServiceLoader}.
     *
     * @param types MIME type database used for magic detection; must not be null
     */
    public DefaultDetector(MimeTypes types) {
        this(types, new ServiceLoader());
    }

    /**
     * Creates a detector using {@link MimeTypes#getDefaultMimeTypes()} and a
     * default-constructed {@link ServiceLoader}.
     */
    public DefaultDetector() {
        this(MimeTypes.getDefaultMimeTypes());
    }

    /**
     * Finds all statically loadable detectors and sorts the list with
     * {@link ServiceLoaderUtils#sortLoadedClasses(List)} rather than leaving it
     * in discovery order. Detectors are used in the resulting order; the intent
     * is that the Tika detectors come last so that non-Tika (user supplied)
     * detectors can take precedence.
     * <p>
     * If an {@link OverrideDetector} is loaded, it is moved to the front so that
     * it takes precedence over all other detectors.
     * <p>
     * Note: MimeTypes is handled separately in the detect() method, not included here.
     *
     * @param loader           service loader; must not be null
     * @param excludeDetectors detector classes to exclude; a null value is
     *                         treated as an empty collection
     * @return ordered list of statically loadable detectors
     */
    private static List<Detector> getDefaultDetectors(ServiceLoader loader,
            Collection<Class<? extends Detector>> excludeDetectors) {
        // The public constructor accepts null exclusions, so normalise here
        // (super(...) must be the constructor's first statement) and never
        // hand a null collection to the service loader.
        Collection<Class<? extends Detector>> excludes =
                excludeDetectors != null ? excludeDetectors : Collections.emptySet();

        List<Detector> detectors = loader.loadStaticServiceProviders(Detector.class, excludes);
        ServiceLoaderUtils.sortLoadedClasses(detectors);

        // Move the first OverrideDetector (if any) to the head of the list.
        for (int i = 0; i < detectors.size(); i++) {
            if (detectors.get(i) instanceof OverrideDetector) {
                detectors.add(0, detectors.remove(i));
                break;
            }
        }
        return detectors;
    }

    /**
     * Runs the detection pipeline and returns the most specific media type found.
     * <p>
     * As a side effect, the result of the magic detection step is recorded in
     * the metadata under {@link TikaCoreProperties#CONTENT_TYPE_MAGIC_DETECTED}.
     *
     * @param tis          stream to inspect
     * @param metadata     metadata passed to every detector; updated with the magic result
     * @param parseContext context passed to every detector
     * @return the chosen media type; {@link MediaType#OCTET_STREAM} if nothing
     *         more specific was detected
     * @throws IOException if one of the underlying detectors fails to read the stream
     */
    @Override
    public MediaType detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext)
            throws IOException {
        // 1. Magic detection via MimeTypes
        MediaType magicType = mimeTypes.detect(tis, metadata, parseContext);
        metadata.set(TikaCoreProperties.CONTENT_TYPE_MAGIC_DETECTED, magicType.toString());

        // 2. Run other detectors (container detectors, etc.)
        // Note: Container detectors that need random access handle their own spooling
        MediaType detectedType = super.detect(tis, metadata, parseContext);

        // 3. Text detection - only if both previous steps came back unknown
        MediaType textType = null;
        if (MediaType.OCTET_STREAM.equals(detectedType) &&
                MediaType.OCTET_STREAM.equals(magicType)) {
            textType = textDetector.detect(tis, metadata, parseContext);
        }

        // 4. Return most specific
        return mostSpecific(magicType, detectedType, textType);
    }

    /**
     * Picks the best candidate out of the three detection results.
     * <p>
     * A null or {@link MediaType#OCTET_STREAM} candidate counts as "unknown".
     * The magic type is the baseline. The type from the other detectors replaces
     * it unless the magic type is a specialization of that type according to the
     * media type registry. The text type is used only when both other
     * candidates are unknown.
     *
     * @param magicType    result of the MimeTypes magic detection; may be null
     * @param detectedType result of the composite (SPI loaded) detectors; may be null
     * @param textType     result of the text detector; may be null
     * @return the selected type, or {@link MediaType#OCTET_STREAM} if all are unknown
     */
    private MediaType mostSpecific(MediaType magicType, MediaType detectedType,
                                   MediaType textType) {
        MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();

        // Start with magic type as baseline if valid
        MediaType best = MediaType.OCTET_STREAM;
        if (magicType != null && !MediaType.OCTET_STREAM.equals(magicType)) {
            best = magicType;
        }

        // Container detectors may find more specific types (e.g., OLE -> msword)
        // or less specific (e.g., commons-compress tar vs magic gtar).
        // Use the registry to determine which is more specific.
        if (detectedType != null && !MediaType.OCTET_STREAM.equals(detectedType)) {
            // Take detectedType when:
            //  - there is no baseline yet, or
            //  - it is more specific than the baseline, or
            //  - the two are unrelated (prefer container detection for
            //    different format families).
            // Only when the baseline is the more specific of the two is it kept.
            if (MediaType.OCTET_STREAM.equals(best)
                    || registry.isSpecializationOf(detectedType, best)
                    || !registry.isSpecializationOf(best, detectedType)) {
                best = detectedType;
            }
        }

        // Text detection as fallback only if still unknown
        if (MediaType.OCTET_STREAM.equals(best) && textType != null &&
                !MediaType.OCTET_STREAM.equals(textType)) {
            best = textType;
        }
        return best;
    }

    /**
     * Returns the detectors of this composite. When a service loader is
     * available and reports itself as dynamic, the dynamically loaded
     * detectors are placed ahead of the statically loaded ones.
     *
     * @return the list of detectors, dynamic ones (if any) first
     */
    @Override
    public List<Detector> getDetectors() {
        // No loader (the field is transient) or a non-dynamic loader: static list only.
        if (loader == null || !loader.isDynamic()) {
            return super.getDetectors();
        }
        List<Detector> detectors = loader.loadDynamicServiceProviders(Detector.class);
        if (detectors.isEmpty()) {
            return super.getDetectors();
        }
        detectors.addAll(super.getDetectors());
        return detectors;
    }

    /**
     * Returns the classes that were explicitly excluded when constructing this detector.
     * Used for round-trip serialization to preserve exclusion configuration.
     *
     * @return unmodifiable collection of excluded detector classes, never null
     */
    public Collection<Class<? extends Detector>> getExcludedClasses() {
        return excludedClasses;
    }
}