ParseMode.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.pipes.api;
import java.util.Locale;
import org.apache.tika.config.TikaComponent;
/**
* Controls how embedded documents are handled during parsing.
* <p>
* This can be set as a default in PipesConfig (loaded from tika-config.json)
* or overridden per-file via ParseContext.
*/
@TikaComponent(name = "parse-mode")
public enum ParseMode {
/**
* Each embedded file gets its own metadata object in a list.
* <p>
* This is equivalent to the -J option in tika-app and the /rmeta endpoint
* in tika-server. The result is a list of metadata objects, one for each
* document (container + all embedded documents).
*/
RMETA,
/**
* Concatenates content from all embedded files into a single document.
* <p>
* This is equivalent to the legacy tika-app behavior and the /tika endpoint
* in tika-server. The result is a single metadata object with concatenated
* content from all documents.
*/
CONCATENATE,
/**
* Performs digest (if configured) and content type detection only.
* <p>
* No parsing occurs - embedded documents are not extracted and no content
* is returned. Use this mode when you only need file identification
* (mime type, hash) without text extraction.
*/
NO_PARSE,
/**
* Concatenates content and emits only the raw content string, with no
* metadata and no JSON wrapper.
* <p>
* This mode parses like CONCATENATE (producing a single metadata object with
* merged content from all embedded documents), but at emit time, emitters
* write only the value of {@code X-TIKA:content} as a raw string instead of
* serializing the full metadata list as JSON.
* <p>
* This is useful when you want plain text, markdown, or HTML output files
* without any metadata overhead.
*/
CONTENT_ONLY,
/**
* Extracts embedded document bytes and emits them, with full RMETA metadata.
* <p>
* This mode parses like RMETA (returning a metadata object per document) AND
* automatically extracts and emits embedded document bytes. An emitter is
* required for the byte extraction.
* <p>
* With PASSBACK_ALL emit strategy, embedded bytes are still emitted during
* parsing, but metadata is passed back to the client instead of being emitted.
* This is useful when you want bytes written to storage but need metadata
* returned for further processing (e.g., indexing to a database).
* <p>
* This mode simplifies byte extraction by handling all the internal setup
* (UnpackExtractor, EmittingUnpackHandler) automatically.
* Users just need to specify the emitter in UnpackConfig or FetchEmitTuple.
*/
UNPACK;
/**
* Parses a string to a ParseMode enum value.
*
* @param modeString the string to parse (case-insensitive)
* @return the corresponding ParseMode
* @throws IllegalArgumentException if the string doesn't match any mode
*/
public static ParseMode parse(String modeString) {
if (modeString == null) {
throw new IllegalArgumentException("Parse mode cannot be null");
}
String normalized = modeString.toUpperCase(Locale.ROOT).trim();
try {
return ParseMode.valueOf(normalized);
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(
"Invalid parse mode: '" + modeString + "'. " +
"Must be one of: RMETA, CONCATENATE, CONTENT_ONLY, NO_PARSE, UNPACK");
}
}
}