ExternalParser.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.external;
import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.Serializable;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.NullOutputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
/**
* Parser that uses an external program (like catdoc or pdf2txt) to extract
* text content and metadata from a given document.
*
* @deprecated Use {@link org.apache.tika.parser.external2.ExternalParser} instead.
* This class will be removed in a future version of Tika.
*/
@Deprecated
public class ExternalParser implements Parser {
// Logger used for diagnostics while launching and checking external commands.
private static final Logger LOG = LoggerFactory.getLogger(ExternalParser.class);
/**
 * The token which, if present in any element of the command, is replaced
 * with the path of the input file. When the token is absent, the input
 * data is streamed to the process over STDIN instead.
 */
public static final String INPUT_FILE_TOKEN = "${INPUT}";
/**
 * The token which, if present in any element of the command, is replaced
 * with the path of a temporary output file. When the token is absent, the
 * output data is collected from the process's STDOUT instead.
 */
public static final String OUTPUT_FILE_TOKEN = "${OUTPUT}";
private static final long serialVersionUID = -1079128990650687037L;
// Timeout in milliseconds.
// NOTE(review): presumably meant to bound how long the external process may
// run -- confirm where it is applied. TODO: make this parameterizable.
private final long timeoutMs = 60000;
/**
 * Media types supported by the external program. Empty until
 * {@link #setSupportedTypes(Set)} is called.
 */
private Set<MediaType> supportedTypes = Collections.emptySet();
/**
 * Regular expressions applied line by line to the process output to
 * extract Metadata (STDERR always; STDOUT only when the document output
 * goes to a file). A non-empty map value is used as the metadata key with
 * group 1 as the value; an empty or null map value means group 1 is the
 * key and group 2 the value. {@code null} or an empty map disables
 * metadata extraction.
 */
private Map<Pattern, String> metadataPatterns = null;
/**
 * The external command to invoke. Defaults to {@code cat}.
 *
 * @see Runtime#exec(String[])
 */
private String[] command = new String[]{"cat"};
/**
 * Consumer that receives every output line which matched none of the
 * metadata patterns. Defaults to a consumer that discards the lines.
 */
private LineConsumer ignoredLineConsumer = LineConsumer.NULL;
/**
 * Drains the given stream into nothing and waits for the draining to
 * complete. The work is delegated to
 * {@link #ignoreStream(InputStream, boolean)} with waiting enabled, so the
 * calling thread is blocked until the helper thread has consumed and
 * closed the stream.
 *
 * @param stream stream whose content is to be discarded
 */
private static void ignoreStream(final InputStream stream) {
    final boolean blockUntilDrained = true;
    ignoreStream(stream, blockUntilDrained);
}
/**
 * Starts a thread that reads and discards the contents of the given
 * stream. I/O errors while reading are ignored, and the stream is closed
 * once it has been fully processed (or reading failed).
 *
 * @param stream       stream to be sent to the black hole (a.k.a. null)
 * @param waitForDeath when {@code true} the calling thread is blocked
 *                     until the new thread has finished; if the caller is
 *                     interrupted while waiting, the wait ends early and
 *                     the caller's interrupt status is restored
 * @return the thread that was created and started
 */
private static Thread ignoreStream(final InputStream stream, boolean waitForDeath) {
    Thread t = new Thread(() -> {
        try {
            IOUtils.copy(stream, NullOutputStream.INSTANCE);
        } catch (IOException e) {
            //swallow: the content is being thrown away anyway
        } finally {
            IOUtils.closeQuietly(stream);
        }
    });
    t.start();
    if (waitForDeath) {
        try {
            t.join();
        } catch (InterruptedException e) {
            // Stop waiting, but keep the interruption visible to the caller
            // instead of silently clearing it.
            Thread.currentThread().interrupt();
        }
    }
    return t;
}
/**
 * Checks whether a command can be run, e.g. to find out whether a program
 * is installed and on the path. The string is wrapped in a one-element
 * array and handed to {@link #check(String[], int...)}.
 * <p>
 * NOTE(review): because the array overload passes its argument to
 * {@link Runtime#exec(String[])} as is, a value containing spaces (such as
 * "myapp --version") is treated as a single program name -- confirm
 * whether callers needing arguments should use the array overload.
 *
 * @param checkCmd   the check command to run
 * @param errorValue exit values that are considered an error; when none
 *                   are given, the array overload falls back to 127
 * @return the result of {@link #check(String[], int...)}
 */
public static boolean check(String checkCmd, int... errorValue) {
    final String[] singleCommand = {checkCmd};
    return check(singleCommand, errorValue);
}
/**
 * Checks whether the given command can be run by executing it and
 * inspecting its exit value. STDOUT and STDERR of the process are drained
 * and discarded in background threads, and the process is always forcibly
 * destroyed before this method returns.
 *
 * @param checkCmd   the command and its arguments, as for
 *                   {@link Runtime#exec(String[])}; a {@code null} or empty
 *                   array is reported as "cannot be run"
 * @param errorValue exit values that are considered an error; defaults to
 *                   127 when none are given
 * @return {@code true} if the process exited within the timeout with an
 * exit value that is not one of {@code errorValue}; {@code false} if
 * it could not be started, timed out, the calling thread was
 * interrupted, or the exit value is an error value
 * @throws SecurityException if process execution is not permitted
 */
public static boolean check(String[] checkCmd, int... errorValue) {
    if (checkCmd == null || checkCmd.length == 0) {
        // Nothing to execute: treat it like a command that is not available.
        return false;
    }
    if (errorValue == null || errorValue.length == 0) {
        errorValue = new int[]{127};
    }
    //make the timeout parameterizable
    final long checkTimeoutMs = 60000;
    Process process = null;
    try {
        process = Runtime.getRuntime().exec(checkCmd);
        // The drain threads run concurrently so the process can never block
        // on a full pipe while we wait for it.
        Thread stdErrSuckerThread = ignoreStream(process.getErrorStream(), false);
        Thread stdOutSuckerThread = ignoreStream(process.getInputStream(), false);
        // Wait on the process first: joining the drain threads before this
        // point would block without limit on a hung command and the timeout
        // would never take effect.
        boolean finished = process.waitFor(checkTimeoutMs, TimeUnit.MILLISECONDS);
        if (!finished) {
            throw new TimeoutException();
        }
        // Bounded joins: a descendant of the process may still hold the
        // pipes open after the process itself has exited.
        stdErrSuckerThread.join(checkTimeoutMs);
        stdOutSuckerThread.join(checkTimeoutMs);
        int result = process.exitValue();
        LOG.debug("exit value for {}: {}", checkCmd[0], result);
        for (int err : errorValue) {
            if (result == err) {
                return false;
            }
        }
        return true;
    } catch (InterruptedException e) {
        // Preserve the interruption for the caller rather than clearing it.
        Thread.currentThread().interrupt();
        LOG.debug("interrupted trying to run " + checkCmd[0], e);
        return false;
    } catch (IOException | TimeoutException e) {
        LOG.debug("exception trying to run " + checkCmd[0], e);
        // Some problem, command is there or is broken
        return false;
    } catch (SecurityException se) {
        // External process execution is banned by the security manager
        throw se;
    } catch (Error err) {
        if (err.getMessage() != null && (err.getMessage().contains("posix_spawn") ||
                err.getMessage().contains("UNIXProcess"))) {
            LOG.debug("(TIKA-1526): exception trying to run: " + checkCmd[0], err);
            //"Error forking command due to JVM locale bug
            //(see TIKA-1526 and SOLR-6387)"
            return false;
        }
        //throw if a different kind of error
        throw err;
    } finally {
        if (process != null) {
            process.destroyForcibly();
        }
    }
}
/**
 * Returns the media types handled by this parser. The parse context is
 * not consulted; the result is whatever {@link #getSupportedTypes()}
 * returns.
 *
 * @param context parse context (not used)
 * @return the configured supported media types
 */
public Set<MediaType> getSupportedTypes(ParseContext context) {
    final Set<MediaType> configured = getSupportedTypes();
    return configured;
}
/**
 * Returns the media types configured for the external program.
 *
 * @return the currently configured set of supported media types
 */
public Set<MediaType> getSupportedTypes() {
    return this.supportedTypes;
}
/**
 * Sets the media types supported by the external program. The given set
 * is copied and the copy is stored behind an unmodifiable view, so later
 * changes to the argument do not affect this parser.
 *
 * @param supportedTypes media types to support
 */
public void setSupportedTypes(Set<MediaType> supportedTypes) {
    final Set<MediaType> snapshot = new HashSet<>(supportedTypes);
    this.supportedTypes = Collections.unmodifiableSet(snapshot);
}
/**
 * Returns the external command to run. The array held by this parser is
 * returned directly, without copying.
 *
 * @return the configured command
 */
public String[] getCommand() {
    return this.command;
}
/**
 * Sets the command to be run. Any element may contain
 * {@link #INPUT_FILE_TOKEN} or {@link #OUTPUT_FILE_TOKEN} if the command
 * needs filenames. The given array is stored as is, without copying.
 *
 * @param command the command and its arguments
 * @see Runtime#exec(String[])
 */
public void setCommand(String... command) {
this.command = command;
}
/**
 * Returns the consumer that is handed the lines ignored during parsing.
 *
 * @return the configured consumer instance
 */
public LineConsumer getIgnoredLineConsumer() {
    return this.ignoredLineConsumer;
}
/**
 * Sets the consumer for the lines ignored by the parse functions, i.e.
 * the output lines that matched none of the metadata extraction patterns.
 *
 * @param ignoredLineConsumer consumer instance
 */
public void setIgnoredLineConsumer(LineConsumer ignoredLineConsumer) {
this.ignoredLineConsumer = ignoredLineConsumer;
}
/**
 * Returns the map of regular expression patterns to Metadata keys used
 * for metadata extraction.
 *
 * @return the configured patterns; may be {@code null}
 */
public Map<Pattern, String> getMetadataExtractionPatterns() {
    return this.metadataPatterns;
}
/**
 * Sets the map of regular expression patterns and Metadata
 * keys. Any matching patterns will have the matching
 * metadata entries set.
 * Set this to null to disable Metadata extraction.
 *
 * @param patterns patterns mapped to Metadata keys, or {@code null}
 */
public void setMetadataExtractionPatterns(Map<Pattern, String> patterns) {
this.metadataPatterns = patterns;
}
/**
 * Runs the configured external command on the given document and reports
 * the result as a simple XHTML document to the given SAX content handler.
 * Metadata is extracted only when patterns have been supplied through
 * {@link #setMetadataExtractionPatterns(Map)}. Temporary resources created
 * during the run are disposed of before this method returns, whether or
 * not parsing succeeded.
 */
public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                  ParseContext context) throws IOException, SAXException, TikaException {
    final XHTMLContentHandler xhtmlHandler =
            new XHTMLContentHandler(handler, metadata, context);
    final TemporaryResources scratch = new TemporaryResources();
    try {
        parse(tis, xhtmlHandler, metadata, scratch);
    } finally {
        scratch.dispose();
    }
}
/**
 * Builds the command line, runs the external process and feeds its output
 * to the XHTML handler.
 * <p>
 * A single-element command is split on spaces. {@link #INPUT_FILE_TOKEN}
 * is replaced with the path of the input file (otherwise the input is
 * streamed to STDIN) and {@link #OUTPUT_FILE_TOKEN} with the path of a
 * temporary file (otherwise the output is read from STDOUT).
 * <p>
 * Reading the process streams is not time-limited. Once they have been
 * handled, the process is given {@code timeoutMs} milliseconds to exit and
 * is forcibly destroyed if it does not, or if the calling thread is
 * interrupted while waiting.
 *
 * @param tis      the document to parse
 * @param xhtml    handler receiving the extracted text
 * @param metadata receives entries matched by the metadata patterns
 * @param tmp      used to create the temporary output file, if needed
 * @throws IOException   if reading or writing the process streams fails
 * @throws SAXException  if the XHTML SAX events could not be handled
 * @throws TikaException if the process could not be started or had to be
 *                       destroyed before it exited
 */
private void parse(TikaInputStream tis, XHTMLContentHandler xhtml, Metadata metadata,
                   TemporaryResources tmp) throws IOException, SAXException, TikaException {
    boolean inputToStdIn = true;
    boolean outputFromStdOut = true;
    boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty());
    File output = null;

    // Build our command. Work on a copy so token substitution never alters
    // the configured command array.
    String[] cmd;
    if (command.length == 1) {
        cmd = command[0].split(" ");
    } else {
        cmd = new String[command.length];
        System.arraycopy(command, 0, cmd, 0, command.length);
    }
    for (int i = 0; i < cmd.length; i++) {
        if (cmd[i].contains(INPUT_FILE_TOKEN)) {
            cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, tis.getFile().getPath());
            inputToStdIn = false;
        }
        if (cmd[i].contains(OUTPUT_FILE_TOKEN)) {
            output = tmp.createTemporaryFile();
            outputFromStdOut = false;
            cmd[i] = cmd[i].replace(OUTPUT_FILE_TOKEN, output.getPath());
        }
    }

    // Execute
    Process process;
    try {
        if (cmd.length == 1) {
            process = Runtime.getRuntime().exec(cmd[0]);
        } else {
            process = Runtime.getRuntime().exec(cmd);
        }
    } catch (Exception e) {
        LOG.warn("problem with process exec", e);
        // Without a process there is nothing to read from; fail with the
        // real cause instead of a NullPointerException further down.
        throw new TikaException("Unable to start external process", e);
    }

    boolean exited = false;
    try {
        if (inputToStdIn) {
            sendInput(process, tis);
        } else {
            process.getOutputStream().close();
        }
        InputStream out = process.getInputStream();
        InputStream err = process.getErrorStream();
        if (hasPatterns) {
            extractMetadata(err, metadata);
            if (outputFromStdOut) {
                extractOutput(out, xhtml);
            } else {
                extractMetadata(out, metadata);
            }
        } else {
            ignoreStream(err);
            if (outputFromStdOut) {
                extractOutput(out, xhtml);
            } else {
                ignoreStream(out);
            }
        }
    } finally {
        try {
            exited = process.waitFor(timeoutMs, TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            // Keep the interruption visible to the caller.
            Thread.currentThread().interrupt();
        } finally {
            if (!exited) {
                // Never leave a stray child process behind.
                process.destroyForcibly();
            }
        }
    }
    if (!exited) {
        // The output (in particular an output file) may be incomplete.
        throw new TikaException("External process was destroyed before it exited " +
                "(timeout of " + timeoutMs + " ms exceeded or thread interrupted)");
    }

    // Grab the output if we haven't already
    if (!outputFromStdOut) {
        try (FileInputStream fileInputStream = new FileInputStream(output)) {
            extractOutput(fileInputStream, xhtml);
        }
    }
}
/**
 * Copies the content of the given stream, decoded as UTF-8, into the
 * given XHTML content handler as the text of a single {@code p} element
 * wrapped in a document. The copy runs on the calling thread, and the
 * stream is closed when this method exits.
 *
 * @param stream stream holding the output of the external program
 * @param xhtml  XHTML content handler
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException  if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
        throws SAXException, IOException {
    try (Reader decoded = new InputStreamReader(stream, UTF_8)) {
        xhtml.startDocument();
        xhtml.startElement("p");
        final char[] chunk = new char[1024];
        int count;
        while ((count = decoded.read(chunk)) != -1) {
            xhtml.characters(chunk, 0, count);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
/**
 * Starts a thread that sends the contents of the given input stream to
 * the standard input stream of the given process, and waits for that
 * thread to finish. I/O errors are ignored. The standard input stream of
 * the process is flushed and closed once the copy has ended, so that the
 * process receives all of the data followed by end-of-input. Note that the
 * given input stream is <em>not</em> closed by this method.
 * <p>
 * If the calling thread is interrupted while waiting, the wait ends early
 * and the caller's interrupt status is restored.
 *
 * @param process process whose standard input receives the data
 * @param stream  input stream to copy from
 */
private void sendInput(final Process process, final InputStream stream) {
    Thread t = new Thread(() -> {
        // try-with-resources: the stream returned by the process may be
        // buffered, so it has to be closed (which also flushes it) for the
        // process to see the data and then EOF.
        try (OutputStream stdin = process.getOutputStream()) {
            IOUtils.copy(stream, stdin);
        } catch (IOException e) {
            //swallow: e.g. the process exited without reading all input
        }
    });
    t.start();
    try {
        t.join();
    } catch (InterruptedException e) {
        // Stop waiting, but do not hide the interruption from the caller.
        Thread.currentThread().interrupt();
    }
}
/**
 * Starts a thread that reads the given stream line by line as UTF-8,
 * applies every configured metadata pattern to each line, and waits for
 * that thread to finish. The stream is closed once reading has ended.
 * <p>
 * For a matching pattern with a non-empty map value, the value is used as
 * the metadata key and group 1 of the match as the metadata value;
 * otherwise group 1 is the key and group 2 the value. Lines that match no
 * pattern are handed to the ignored-line consumer.
 * <p>
 * If the calling thread is interrupted while waiting, the wait ends early
 * and the caller's interrupt status is restored.
 *
 * @param stream   process output to scan
 * @param metadata receives the extracted entries
 */
private void extractMetadata(final InputStream stream, final Metadata metadata) {
    Thread t = new Thread(() -> {
        // Closing the reader also closes the underlying stream.
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(stream, UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                boolean consumed = false;
                for (Map.Entry<Pattern, String> entry : metadataPatterns.entrySet()) {
                    Matcher m = entry.getKey().matcher(line);
                    if (m.find()) {
                        consumed = true;
                        if (entry.getValue() != null &&
                                !entry.getValue().equals("")) {
                            metadata.add(entry.getValue(), m.group(1));
                        } else {
                            metadata.add(m.group(1), m.group(2));
                        }
                    }
                }
                if (!consumed) {
                    ignoredLineConsumer.consume(line);
                }
            }
        } catch (IOException e) {
            // Best effort: keep whatever was extracted so far, but leave a trace.
            LOG.debug("problem reading external process output for metadata", e);
        }
    });
    t.start();
    try {
        t.join();
    } catch (InterruptedException e) {
        // Stop waiting, but do not hide the interruption from the caller.
        Thread.currentThread().interrupt();
    }
}
/**
 * Callback that receives lines of text one at a time. The parser hands it
 * each output line that matched none of the metadata extraction patterns.
 * <p>
 * The interface has a single abstract method so that it can be implemented
 * with a lambda, as {@link #NULL} is; {@code @FunctionalInterface} makes
 * the compiler enforce that.
 *
 * @since Apache Tika 1.14
 */
@FunctionalInterface
public interface LineConsumer extends Serializable {
    /**
     * A null consumer: discards every line it is given.
     */
    LineConsumer NULL = line -> {
        // ignores
    };

    /**
     * Consume a line
     *
     * @param line a line of string
     */
    void consume(String line);
}
}