PipesForkResult.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.pipes.fork;
import java.util.Collections;
import java.util.List;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.pipes.api.PipesResult;
/**
 * Outcome of parsing a single file through {@link PipesForkParser}.
 * <p>
 * An instance is a thin wrapper around a {@link PipesResult}: status queries are
 * forwarded to the wrapped result, and a few convenience accessors expose the
 * parsed metadata and content.
 * <p>
 * Extracted text is stored inside the metadata under
 * {@link TikaCoreProperties#TIKA_CONTENT}.
 * <p>
 * <b>Important - which accessor to use:</b>
 * <ul>
 * <li><b>RMETA mode (default):</b> call {@link #getMetadataList()} to reach the content
 * and metadata of the container document AND of every embedded document. The shortcuts
 * {@link #getContent()} and {@link #getMetadata()} look at the container document
 * only - anything belonging to embedded documents will be missed!</li>
 * <li><b>CONCATENATE mode:</b> the result includes only the container document's
 * metadata, with content concatenated from the container document and all
 * attachments.</li>
 * </ul>
 */
public class PipesForkResult {

    /** The wrapped result that every accessor in this class reads from. */
    private final PipesResult pipesResult;

    /**
     * Wrap the given pipes result.
     *
     * @param pipesResult the result to wrap; stored as-is, without validation
     */
    public PipesForkResult(PipesResult pipesResult) {
        this.pipesResult = pipesResult;
    }

    /**
     * Report the status of the wrapped result.
     *
     * @return the result status
     */
    public PipesResult.RESULT_STATUS getStatus() {
        return pipesResult.status();
    }

    /**
     * Tell whether parsing succeeded.
     *
     * @return true if parsing succeeded
     */
    public boolean isSuccess() {
        return pipesResult.isSuccess();
    }

    /**
     * Tell whether the forked process crashed (OOM, timeout, etc.).
     *
     * @return true if the forked process crashed
     */
    public boolean isProcessCrash() {
        return pipesResult.isProcessCrash();
    }

    /**
     * Tell whether a fatal error occurred (the pipes system failed to initialize).
     *
     * @return true if there was a fatal error
     */
    public boolean isFatal() {
        return pipesResult.isFatal();
    }

    /**
     * Tell whether initialization failed (fetcher/emitter initialization issues).
     *
     * @return true if there was an initialization failure
     */
    public boolean isInitializationFailure() {
        return pipesResult.isInitializationFailure();
    }

    /**
     * Tell whether a task exception occurred (fetch/emit/parse issues tied to a
     * specific request).
     *
     * @return true if there was a task exception
     */
    public boolean isTaskException() {
        return pipesResult.isTaskException();
    }

    /**
     * Return every metadata object produced by the parse.
     * <p>
     * <b>This is the recommended accessor for RMETA mode (the default).</b>
     * <p>
     * <b>RMETA mode:</b> one metadata object per document - the container document
     * comes first, followed by each embedded document. Each metadata object holds:
     * <ul>
     * <li>Content under {@link TikaCoreProperties#TIKA_CONTENT}</li>
     * <li>Document metadata (title, author, dates, etc.)</li>
     * <li>Any parse exceptions under {@link TikaCoreProperties#EMBEDDED_EXCEPTION}</li>
     * </ul>
     * <p>
     * <b>CONCATENATE mode:</b> a single metadata object carrying the container's
     * metadata and the concatenated content of all documents.
     *
     * @return the metadata list held by the wrapped result's emit data, or an empty
     *         list when the wrapped result has no emit data
     */
    public List<Metadata> getMetadataList() {
        // No emit data means nothing was produced; hand back an empty list instead of null.
        return pipesResult.emitData() == null
                ? Collections.<Metadata>emptyList()
                : pipesResult.emitData().getMetadataList();
    }

    /**
     * Return the content of the container document only.
     * <p>
     * <b>WARNING - RMETA mode:</b> only the container document's content is returned.
     * Content of embedded documents is NOT included. To collect everything, iterate
     * over {@link #getMetadataList()} and read
     * {@link TikaCoreProperties#TIKA_CONTENT} from each metadata object.
     * <p>
     * <b>CONCATENATE mode:</b> all content (container + embedded) is returned, because
     * everything has been concatenated into a single metadata object. This method
     * works as expected in CONCATENATE mode.
     * <p>
     * <b>Recommendation:</b> in RMETA mode (the default) prefer
     * {@link #getMetadataList()}. This method fits CONCATENATE mode, or callers that
     * only care about the container document's content.
     *
     * @return the container document's content, or null if not available
     * @see #getMetadataList()
     */
    public String getContent() {
        List<Metadata> documents = getMetadataList();
        if (!documents.isEmpty()) {
            // The first entry is the container document.
            return documents.get(0).get(TikaCoreProperties.TIKA_CONTENT);
        }
        return null;
    }

    /**
     * Return the metadata of the container document only.
     * <p>
     * <b>WARNING - RMETA mode:</b> only the container document's metadata is returned.
     * Metadata of embedded documents (including their content, titles, authors, and
     * any parse exceptions) is NOT included. Use {@link #getMetadataList()} to reach
     * the metadata of every document.
     * <p>
     * <b>CONCATENATE mode:</b> there is exactly one metadata object, holding the
     * container's metadata and the concatenated content of all documents. By the
     * nature of CONCATENATE mode, metadata from embedded files is lost, and Tika
     * silently swallows exceptions raised in embedded files.
     * <p>
     * <b>Recommendation:</b> in RMETA mode (the default) prefer
     * {@link #getMetadataList()}, which also exposes embedded document exceptions
     * (stored in {@link TikaCoreProperties#EMBEDDED_EXCEPTION}).
     *
     * @return the container document's metadata, or null if not available
     * @see #getMetadataList()
     */
    public Metadata getMetadata() {
        List<Metadata> documents = getMetadataList();
        // The first entry, when present, is the container document.
        return documents.isEmpty() ? null : documents.get(0);
    }

    /**
     * Return the error message attached to the result, if any.
     *
     * @return the error message, or null if none
     */
    public String getMessage() {
        return pipesResult.message();
    }

    /**
     * Expose the wrapped {@link PipesResult} for advanced access.
     *
     * @return the pipes result
     */
    public PipesResult getPipesResult() {
        return pipesResult;
    }

    /**
     * Summarize the result as its status, the number of metadata objects, and the message.
     *
     * @return a short human-readable description of this result
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("PipesForkResult{");
        text.append("status=").append(getStatus());
        text.append(", metadataCount=").append(getMetadataList().size());
        text.append(", message=").append(getMessage());
        text.append('}');
        return text.toString();
    }
}