// InferenceConfig.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.inference;
import java.io.Serializable;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.utils.StringUtils;
/**
 * Configuration for the inference metadata filters.
 * <p>
 * Controls both the chunking behaviour (how text is split before inference)
 * and the remote endpoint settings (URL, model, auth, timeout).
 * <p>
 * This is a plain mutable bean: every property has a getter and a setter and
 * no synchronization is performed. The nested {@link RuntimeConfig} subclass
 * overrides some setters so that those properties cannot be changed through
 * it.
 */
public class InferenceConfig implements Serializable {

    private static final long serialVersionUID = 1L;

    // ---- endpoint settings ------------------------------------------------

    /** Base URL of the embeddings API (no trailing slash). */
    private String baseUrl = "http://127.0.0.1:8000";

    /** Model identifier sent in the embeddings request. */
    private String model = "";

    /** Optional API key. Empty means no auth. */
    private String apiKey = "";

    /** HTTP read timeout in seconds. */
    private int timeoutSeconds = 120;

    // ---- chunking settings ------------------------------------------------

    /**
     * Maximum number of characters per chunk. The chunker will try to break
     * at markdown heading or paragraph boundaries before hitting this limit.
     */
    private int maxChunkChars = 1500;

    /**
     * Number of characters of overlap between consecutive chunks.
     * Helps ensure no context is lost at chunk boundaries.
     */
    private int overlapChars = 200;

    /**
     * The metadata field to read the source text from.
     * Defaults to {@code tika:content}.
     */
    private String contentField = TikaCoreProperties.TIKA_CONTENT.getName();

    /**
     * The metadata field where the JSON chunk array is written.
     */
    private String outputField = TikaCoreProperties.TIKA_CHUNKS;

    /**
     * If {@code true}, the embedding filter is skipped entirely for this
     * request. Useful when the filter is configured as the default filter
     * but should be bypassed for specific documents (e.g. binary blobs,
     * very short metadata-only records). Set via {@code ParseContext} JSON:
     * {@code {"openai-embedding-filter": {"skipEmbedding": true}}}.
     * Default is {@code false}.
     */
    private boolean skipEmbedding = false;

    /**
     * If {@code true}, the content field (default {@code tika:content}) is
     * removed from metadata after chunking and embedding. This avoids storing
     * the full text twice (once as raw content, once inside the chunks).
     * Default is {@code false}.
     */
    private boolean clearContentAfterChunking = false;

    /**
     * Maximum number of chunk texts to send in a single embeddings API
     * request. If a document produces more chunks than this, the filter
     * splits them into multiple HTTP calls.
     * <p>
     * OpenAI's embeddings endpoint caps at 2048 inputs per request;
     * the default here (256) is a safe value that works across most
     * providers while keeping request sizes reasonable.
     */
    private int maxBatchSize = 256;

    /**
     * Maximum number of chunks to produce per document. If a document's
     * text generates more chunks than this, excess chunks are silently
     * dropped. This prevents pathologically large documents from
     * triggering an unbounded number of embedding API calls.
     * <p>
     * Default is 1024. Set to {@code -1} for no limit.
     */
    private int maxChunks = 1024;

    // ---- getters / setters ------------------------------------------------

    /**
     * @return the base URL of the embeddings API
     */
    public String getBaseUrl() {
        return baseUrl;
    }

    /**
     * Sets the base URL of the embeddings API. The value is stored as given;
     * no normalization or validation is applied here.
     *
     * @param baseUrl the new base URL
     * @throws TikaConfigException never by this implementation; declared so
     *         that subclasses such as {@link RuntimeConfig} can reject the
     *         change
     */
    public void setBaseUrl(String baseUrl) throws TikaConfigException {
        this.baseUrl = baseUrl;
    }

    /**
     * @return the model identifier sent in the embeddings request
     */
    public String getModel() {
        return model;
    }

    /**
     * Sets the model identifier sent in the embeddings request.
     *
     * @param model the new model identifier
     */
    public void setModel(String model) {
        this.model = model;
    }

    /**
     * @return the API key; the default is the empty string
     */
    public String getApiKey() {
        return apiKey;
    }

    /**
     * Sets the API key. The value is stored as given.
     *
     * @param apiKey the new API key
     * @throws TikaConfigException never by this implementation; declared so
     *         that subclasses such as {@link RuntimeConfig} can reject the
     *         change
     */
    public void setApiKey(String apiKey) throws TikaConfigException {
        this.apiKey = apiKey;
    }

    /**
     * @return the HTTP read timeout in seconds
     */
    public int getTimeoutSeconds() {
        return timeoutSeconds;
    }

    /**
     * Sets the HTTP read timeout. The value is stored without range checks.
     *
     * @param timeoutSeconds the timeout in seconds
     */
    public void setTimeoutSeconds(int timeoutSeconds) {
        this.timeoutSeconds = timeoutSeconds;
    }

    /**
     * @return the maximum number of characters per chunk
     */
    public int getMaxChunkChars() {
        return maxChunkChars;
    }

    /**
     * Sets the maximum number of characters per chunk. The value is stored
     * without range checks.
     *
     * @param maxChunkChars the new per-chunk character limit
     */
    public void setMaxChunkChars(int maxChunkChars) {
        this.maxChunkChars = maxChunkChars;
    }

    /**
     * @return the number of characters of overlap between consecutive chunks
     */
    public int getOverlapChars() {
        return overlapChars;
    }

    /**
     * Sets the number of characters of overlap between consecutive chunks.
     * The value is stored without range checks.
     *
     * @param overlapChars the new overlap size in characters
     */
    public void setOverlapChars(int overlapChars) {
        this.overlapChars = overlapChars;
    }

    /**
     * @return the name of the metadata field the source text is read from
     */
    public String getContentField() {
        return contentField;
    }

    /**
     * Sets the metadata field the source text is read from.
     *
     * @param contentField the metadata field name
     */
    public void setContentField(String contentField) {
        this.contentField = contentField;
    }

    /**
     * @return the name of the metadata field the JSON chunk array is
     *         written to
     */
    public String getOutputField() {
        return outputField;
    }

    /**
     * Sets the metadata field the JSON chunk array is written to.
     *
     * @param outputField the metadata field name
     */
    public void setOutputField(String outputField) {
        this.outputField = outputField;
    }

    /**
     * @return {@code true} if the embedding filter should be skipped
     */
    public boolean isSkipEmbedding() {
        return skipEmbedding;
    }

    /**
     * Sets whether the embedding filter should be skipped.
     *
     * @param skipEmbedding {@code true} to skip the filter
     */
    public void setSkipEmbedding(boolean skipEmbedding) {
        this.skipEmbedding = skipEmbedding;
    }

    /**
     * @return {@code true} if the content field should be removed from the
     *         metadata after chunking and embedding
     */
    public boolean isClearContentAfterChunking() {
        return clearContentAfterChunking;
    }

    /**
     * Sets whether the content field is removed from the metadata after
     * chunking and embedding.
     *
     * @param clearContentAfterChunking {@code true} to remove the content
     *        field
     */
    public void setClearContentAfterChunking(boolean clearContentAfterChunking) {
        this.clearContentAfterChunking = clearContentAfterChunking;
    }

    /**
     * @return the maximum number of chunks per embeddings API request
     */
    public int getMaxBatchSize() {
        return maxBatchSize;
    }

    /**
     * Set the maximum number of chunks per embeddings API request.
     * Must be at least 1.
     *
     * @param maxBatchSize the new batch size
     * @throws IllegalArgumentException if {@code maxBatchSize} is less
     *         than 1
     */
    public void setMaxBatchSize(int maxBatchSize) {
        if (maxBatchSize < 1) {
            throw new IllegalArgumentException(
                    "maxBatchSize must be at least 1, got: " + maxBatchSize);
        }
        this.maxBatchSize = maxBatchSize;
    }

    /**
     * @return the maximum number of chunks per document, or {@code -1} for
     *         no limit
     */
    public int getMaxChunks() {
        return maxChunks;
    }

    /**
     * Set the maximum number of chunks per document.
     * Set to {@code -1} for no limit. Must be {@code -1} or at least {@code 1}.
     *
     * @param maxChunks the new chunk limit, or {@code -1} for no limit
     * @throws IllegalArgumentException if {@code maxChunks} is {@code 0} or
     *         less than {@code -1}
     */
    public void setMaxChunks(int maxChunks) {
        if (maxChunks < 1 && maxChunks != -1) {
            throw new IllegalArgumentException(
                    "maxChunks must be -1 (no limit) or at least 1, got: " + maxChunks);
        }
        this.maxChunks = maxChunks;
    }

    /**
     * Runtime-only config that prevents modification of security-sensitive
     * and cost-sensitive fields ({@code baseUrl}, {@code apiKey},
     * {@code model}, {@code maxBatchSize}, {@code maxChunks}) at parse time.
     * <p>
     * These fields must be set at initialization via the config file.
     * If a runtime {@code ParseContext} JSON config attempts to set them,
     * the overridden setters reject the change and leave the field at its
     * current value:
     * <ul>
     *   <li>{@code setBaseUrl} and {@code setApiKey} throw
     *       {@link TikaConfigException} for a non-blank value and ignore a
     *       blank one;</li>
     *   <li>{@code setModel} throws {@link IllegalStateException} for a
     *       non-blank value and ignores a blank one;</li>
     *   <li>{@code setMaxBatchSize} and {@code setMaxChunks} always throw
     *       {@link IllegalStateException}.</li>
     * </ul>
     * All other properties remain settable through the inherited setters.
     */
    public static class RuntimeConfig extends InferenceConfig {

        private static final long serialVersionUID = 1L;

        /**
         * Rejects any attempt to supply a base URL at runtime. A blank value
         * (per {@code StringUtils.isBlank}) is ignored.
         *
         * @param baseUrl the attempted override
         * @throws TikaConfigException if {@code baseUrl} is not blank
         */
        @Override
        public void setBaseUrl(String baseUrl) throws TikaConfigException {
            if (!StringUtils.isBlank(baseUrl)) {
                throw new TikaConfigException(
                        "Cannot modify baseUrl at runtime. "
                                + "URLs must be configured at initialization time.");
            }
        }

        /**
         * Rejects any attempt to supply an API key at runtime. A blank value
         * (per {@code StringUtils.isBlank}) is ignored.
         *
         * @param apiKey the attempted override
         * @throws TikaConfigException if {@code apiKey} is not blank
         */
        @Override
        public void setApiKey(String apiKey) throws TikaConfigException {
            if (!StringUtils.isBlank(apiKey)) {
                throw new TikaConfigException(
                        "Cannot modify apiKey at runtime. "
                                + "API keys must be configured at initialization time.");
            }
        }

        /**
         * Rejects any attempt to supply a model at runtime. A blank value
         * (per {@code StringUtils.isBlank}) is ignored, matching
         * {@link #setBaseUrl(String)} and {@link #setApiKey(String)}.
         *
         * @param model the attempted override
         * @throws IllegalStateException if {@code model} is not blank
         */
        @Override
        public void setModel(String model) {
            // The inherited signature declares no checked exception, so the
            // rejection has to be unchecked rather than a TikaConfigException.
            if (!StringUtils.isBlank(model)) {
                throw new IllegalStateException(
                        "Cannot modify model at runtime. "
                                + "Models must be configured at initialization time. "
                                + "If you need a different model, configure a "
                                + "separate filter instance.");
            }
        }

        /**
         * Always rejects the call; the batch size cannot be changed at
         * runtime.
         *
         * @param maxBatchSize the attempted override (never applied)
         * @throws IllegalStateException always
         */
        @Override
        public void setMaxBatchSize(int maxBatchSize) {
            throw new IllegalStateException(
                    "Cannot modify maxBatchSize at runtime. "
                            + "Batch size must be configured at initialization time.");
        }

        /**
         * Always rejects the call; the chunk limit cannot be changed at
         * runtime.
         *
         * @param maxChunks the attempted override (never applied)
         * @throws IllegalStateException always
         */
        @Override
        public void setMaxChunks(int maxChunks) {
            throw new IllegalStateException(
                    "Cannot modify maxChunks at runtime. "
                            + "Chunk limits must be configured at initialization time.");
        }
    }
}