// ParserLoader.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.config.loader;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.fasterxml.jackson.databind.JsonNode;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.RenderingParser;
import org.apache.tika.renderer.Renderer;
/**
 * Loader for parsers with support for:
 * <ul>
 * <li>SPI fallback via "default-parser" marker with exclusions</li>
 * <li>Mime type filtering decorations (_mime-include, _mime-exclude)</li>
 * <li>EncodingDetector and Renderer dependency injection</li>
 * </ul>
 */
public class ParserLoader extends AbstractSpiComponentLoader<Parser> {

    /** Name used in error messages when decorating the SPI-loaded default composite. */
    private static final String DEFAULT_PARSER_NAME = "default-parser";

    /**
     * Creates a loader registered with the superclass under the names
     * {@code "parsers"} and {@code "default-parser"} for the {@link Parser} type.
     * NOTE(review): presumably the first is the config section key and the second
     * the SPI marker name; confirm against AbstractSpiComponentLoader.
     */
    public ParserLoader() {
        super("parsers", DEFAULT_PARSER_NAME, Parser.class);
    }

    /**
     * Instantiates a single configured parser and, when the configuration carries
     * mime filtering decorations, wraps it in a mime-filtering decorator.
     *
     * @param name       the component name passed to {@code context.instantiate}
     * @param configNode the configuration for this parser, including any
     *                   framework-level decoration keys
     * @param context    the loader context supplying the object mapper and instantiation
     * @return the parser, decorated with mime filters if any were configured
     * @throws TikaConfigException if loading fails with an I/O error or a configured
     *                             mime type string cannot be parsed
     */
    @Override
    protected Parser loadComponent(String name, JsonNode configNode,
                                   LoaderContext context) throws TikaConfigException {
        try {
            // Separate framework-level keys (e.g. _mime-include/_mime-exclude)
            // from the configuration that belongs to the parser itself.
            FrameworkConfig framework = FrameworkConfig.extract(
                    configNode, context.getObjectMapper());
            Parser parser = context.instantiate(name, framework.getComponentConfigNode());
            return decorateIfFiltered(parser, framework, name);
        } catch (IOException e) {
            throw new TikaConfigException("Failed to load parser: " + name, e);
        }
    }

    /**
     * Builds the SPI-backed default parser.
     *
     * @param exclusions parser classes handed to {@link DefaultParser} as exclusions
     * @param context    the loader context supplying the class loader for the
     *                   {@link ServiceLoader}
     * @return a new {@link DefaultParser}
     */
    @Override
    protected Parser createDefaultComposite(Set<Class<? extends Parser>> exclusions,
                                            LoaderContext context) {
        return new DefaultParser(
                TikaLoader.getMediaTypeRegistry(),
                new ServiceLoader(context.getClassLoader()),
                exclusions);
    }

    /**
     * Applies mime filtering decorations, if configured, to the default composite parser.
     *
     * @param parser     the default composite parser
     * @param configNode the configuration of the default-parser entry; may be {@code null},
     *                   in which case the parser is returned unchanged
     * @param context    the loader context supplying the object mapper
     * @return the parser, decorated with mime filters if any were configured
     * @throws TikaConfigException if the configuration cannot be read or a configured
     *                             mime type string cannot be parsed
     */
    @Override
    protected Parser decorateDefaultComposite(Parser parser, JsonNode configNode,
                                              LoaderContext context) throws TikaConfigException {
        if (configNode == null) {
            return parser;
        }
        try {
            FrameworkConfig framework = FrameworkConfig.extract(
                    configNode, context.getObjectMapper());
            return decorateIfFiltered(parser, framework, DEFAULT_PARSER_NAME);
        } catch (IOException e) {
            throw new TikaConfigException("Failed to apply mime filtering to default-parser", e);
        }
    }

    /**
     * Combines the given parsers into a {@link CompositeParser} that uses the
     * media type registry obtained from {@link TikaLoader}.
     *
     * @param parsers the parsers to combine
     * @param context the loader context (not used by this implementation)
     * @return a new {@link CompositeParser} over {@code parsers}
     */
    @Override
    protected Parser wrapInComposite(List<Parser> parsers, LoaderContext context) {
        return new CompositeParser(TikaLoader.getMediaTypeRegistry(), parsers);
    }

    /**
     * Returns the class of the parser behind a single level of decoration.
     * Only one {@link ParserDecorator} layer is removed, so a decorator this loader
     * added around a configured parser resolves to the configured parser's class.
     *
     * @param component the possibly decorated parser
     * @return the wrapped parser's class for a {@link ParserDecorator},
     *         otherwise the component's own class
     */
    @Override
    @SuppressWarnings("unchecked")
    protected Class<? extends Parser> unwrapClass(Parser component) {
        if (component instanceof ParserDecorator pd) {
            return (Class<? extends Parser>) pd.getWrappedParser().getClass();
        }
        return component.getClass();
    }

    /**
     * Injects the context's {@link EncodingDetector} and {@link Renderer} into the
     * parser and every parser reachable from it.
     *
     * @param parser  the fully assembled parser
     * @param context the loader context supplying the encoding detector and renderer
     * @return the same {@code parser} instance
     * @throws TikaConfigException declared by the overridden method; not thrown
     *                             directly by this implementation
     */
    @Override
    protected Parser postProcess(Parser parser, LoaderContext context)
            throws TikaConfigException {
        EncodingDetector encodingDetector = context.getEncodingDetector();
        Renderer renderer = context.getRenderer();
        injectDependenciesRecursively(parser, encodingDetector, renderer);
        return parser;
    }

    /**
     * Recursively inject dependencies into a parser and its children.
     * A {@code null} dependency is skipped rather than injected. Children are the
     * component parsers of a {@link CompositeParser} or the wrapped parser of a
     * {@link ParserDecorator}.
     */
    private void injectDependenciesRecursively(Parser parser, EncodingDetector encodingDetector,
                                               Renderer renderer) {
        if (encodingDetector != null && parser instanceof AbstractEncodingDetectorParser aedp) {
            aedp.setEncodingDetector(encodingDetector);
        }
        if (renderer != null && parser instanceof RenderingParser rp) {
            rp.setRenderer(renderer);
        }
        if (parser instanceof CompositeParser cp) {
            for (Parser child : cp.getAllComponentParsers()) {
                injectDependenciesRecursively(child, encodingDetector, renderer);
            }
        } else if (parser instanceof ParserDecorator pd) {
            injectDependenciesRecursively(pd.getWrappedParser(), encodingDetector, renderer);
        }
    }

    /**
     * Wraps the parser with mime filters when the framework config requests filtering;
     * otherwise returns the parser unchanged.
     *
     * @param parser        the parser to decorate
     * @param framework     the extracted framework configuration
     * @param componentName the component name used in error messages
     * @throws TikaConfigException if a configured mime type string cannot be parsed
     */
    private Parser decorateIfFiltered(Parser parser, FrameworkConfig framework,
                                      String componentName) throws TikaConfigException {
        FrameworkConfig.ParserDecoration decoration = framework.getDecoration();
        if (decoration == null || !decoration.hasFiltering()) {
            return parser;
        }
        return applyMimeFiltering(parser, decoration, componentName);
    }

    /**
     * Apply mime type filtering to a parser.
     * Uses ParserDecorator.withMimeFilters() which creates a MimeFilteringDecorator
     * that the serializer knows how to handle for round-trip support.
     *
     * @param parser        the parser to decorate
     * @param decoration    the include/exclude mime type strings
     * @param componentName the component name used in error messages
     * @throws TikaConfigException if any include or exclude entry is not a valid media type
     */
    private Parser applyMimeFiltering(Parser parser,
                                      FrameworkConfig.ParserDecoration decoration,
                                      String componentName) throws TikaConfigException {
        Set<MediaType> includeTypes = new HashSet<>();
        Set<MediaType> excludeTypes = new HashSet<>();
        for (String mimeStr : decoration.getMimeInclude()) {
            includeTypes.add(parseMediaType(mimeStr, "_mime-include", componentName));
        }
        for (String mimeStr : decoration.getMimeExclude()) {
            excludeTypes.add(parseMediaType(mimeStr, "_mime-exclude", componentName));
        }
        return ParserDecorator.withMimeFilters(parser, includeTypes, excludeTypes);
    }

    /**
     * Parses one configured mime type string, rejecting values that
     * {@link MediaType#parse(String)} cannot understand.
     *
     * @param mimeStr       the configured mime type string
     * @param key           the configuration key the value came from
     * @param componentName the component whose configuration is being read
     * @return the parsed media type, never {@code null}
     * @throws TikaConfigException if the string is not a valid media type
     */
    private static MediaType parseMediaType(String mimeStr, String key, String componentName)
            throws TikaConfigException {
        MediaType mediaType = MediaType.parse(mimeStr);
        if (mediaType == null) {
            // MediaType.parse signals failure with null; without this check the null
            // would be stored in the filter set and the typo would go unreported.
            throw new TikaConfigException("Invalid media type '" + mimeStr + "' in " + key
                    + " for parser: " + componentName);
        }
        return mediaType;
    }
}