// ClaudeVLMParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.vlm;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;

import org.apache.tika.config.ConfigDeserializer;
import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.utils.StringUtils;

/**
 * VLM parser for the <b>Anthropic Claude</b> Messages API.
 * <p>
 * Supports images (JPEG, PNG, GIF, WebP) and PDFs natively. Claude
 * processes each PDF page as both extracted text and a rendered image,
 * understanding layout, charts, tables and diagrams.
 * <p>
 * Authentication uses the {@code x-api-key} header (not Bearer).
 * The required {@code anthropic-version} header is sent automatically.
 * Requests target {@code <baseUrl>/v1/messages}; the default base URL is
 * {@code https://api.anthropic.com}.
 * <p>
 * Configuration key: {@code "claude-vlm-parser"}
 *
 * @since Apache Tika 4.0
 */
@TikaComponent(spi = false)
public class ClaudeVLMParser extends AbstractVLMParser {

    private static final long serialVersionUID = 1L;

    /** Value sent in the {@code anthropic-version} request header. */
    private static final String ANTHROPIC_VERSION = "2023-06-01";

    /** Path of the Messages endpoint, appended to the configured base URL. */
    private static final String MESSAGES_PATH = "/v1/messages";

    /** MIME type that is sent as a {@code document} block rather than an {@code image} block. */
    private static final String PDF_MIME_TYPE = "application/pdf";

    /** Subtype prefix used by the image types registered in {@link #SUPPORTED_TYPES}. */
    private static final String OCR = "ocr-";

    /** Full type prefix ({@code image/ocr-}) that is stripped before a type is sent to the API. */
    private static final String OCR_IMAGE_PREFIX = "image/" + OCR;

    private static final Set<MediaType> SUPPORTED_TYPES;

    static {
        // NOTE(review): WebP is registered under both the plain and the "ocr-" type,
        // while the other image formats are registered with the "ocr-" prefix only.
        // Confirm that this asymmetry is intended.
        Set<MediaType> types = new HashSet<>(Arrays.asList(
                // images - Claude supports JPEG, PNG, GIF, WebP
                MediaType.image(OCR + "png"),
                MediaType.image(OCR + "jpeg"),
                MediaType.image(OCR + "gif"),
                MediaType.image("webp"),
                MediaType.image(OCR + "webp"),
                // PDFs - Claude handles these natively with vision
                MediaType.application("pdf")
        ));
        SUPPORTED_TYPES = Collections.unmodifiableSet(types);
    }

    /** Shared mapper used for building requests and reading responses. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Creates a parser with the Claude defaults (Anthropic base URL and a
     * default model name).
     */
    public ClaudeVLMParser() {
        super(claudeDefaults());
    }

    /**
     * Creates a parser that uses the given configuration as-is.
     *
     * @param config the configuration handed to the base class
     */
    public ClaudeVLMParser(VLMOCRConfig config) {
        super(config);
    }

    /**
     * Creates a parser from a JSON configuration, deserialized into a
     * {@link VLMOCRConfig}.
     *
     * @param jsonConfig the JSON configuration to deserialize
     */
    public ClaudeVLMParser(JsonConfig jsonConfig) {
        this(ConfigDeserializer.buildConfig(jsonConfig, VLMOCRConfig.class));
    }

    /**
     * Builds the configuration used by the no-arg constructor.
     *
     * @return a new config pointing at {@code https://api.anthropic.com}
     *         with the default model set
     * @throws IllegalStateException if the config rejects the built-in defaults
     */
    private static VLMOCRConfig claudeDefaults() {
        try {
            VLMOCRConfig cfg = new VLMOCRConfig();
            cfg.setBaseUrl("https://api.anthropic.com");
            cfg.setModel("claude-sonnet-4-20250514");
            return cfg;
        } catch (TikaConfigException e) {
            // Should never happen on base VLMOCRConfig
            throw new IllegalStateException("Unable to build default Claude configuration", e);
        }
    }

    /**
     * Assembles the HTTP call for one document: the JSON body, the Messages
     * endpoint URL and the Anthropic-specific headers.
     *
     * @param config     supplies base URL, API key and the request settings
     * @param base64Data the base64-encoded file content
     * @param mimeType   the MIME type of the encoded content
     * @return the call description consumed by the base class
     */
    @Override
    protected HttpCall buildHttpCall(VLMOCRConfig config,
                                     String base64Data, String mimeType) {
        String json = buildRequestJson(config, base64Data, mimeType);
        String url = stripTrailingSlash(config.getBaseUrl()) + MESSAGES_PATH;

        Map<String, String> headers = new HashMap<>();
        headers.put("anthropic-version", ANTHROPIC_VERSION);
        // The key header is omitted entirely when no key is configured.
        if (!StringUtils.isBlank(config.getApiKey())) {
            headers.put("x-api-key", config.getApiKey());
        }

        return new HttpCall(url, json, headers);
    }

    /**
     * Extracts the generated text from a Messages API response and records
     * token usage in {@code metadata}.
     * <p>
     * All blocks of type {@code text} in the {@code content} array are
     * concatenated, separated by a newline. Blocks of other types are
     * ignored, so the result may be an empty string.
     *
     * @param responseBody the raw JSON response body
     * @param metadata     receives the prompt/completion token counts when present
     * @return the concatenated text of all text blocks
     * @throws TikaException if the body is {@code null}, is not valid JSON,
     *                       carries a non-null {@code error} object, or has no
     *                       non-empty {@code content} array
     */
    @Override
    protected String extractResponseText(String responseBody, Metadata metadata)
            throws TikaException {
        // Jackson rejects a null String with an unchecked exception; surface it
        // as the checked exception callers of this method already handle.
        if (responseBody == null) {
            throw new TikaException("Claude response body is null");
        }

        JsonNode root;
        try {
            root = MAPPER.readTree(responseBody);
        } catch (IOException e) {
            throw new TikaException(
                    "Failed to parse Claude response JSON: " + e.getMessage(), e);
        }
        if (root == null) {
            throw new TikaException(
                    "Claude response contains no content: " + responseBody);
        }

        failOnApiError(root);
        recordUsage(root.get("usage"), metadata);

        JsonNode content = root.get("content");
        if (content == null || !content.isArray() || content.isEmpty()) {
            throw new TikaException(
                    "Claude response contains no content: " + responseBody);
        }
        return joinTextBlocks(content);
    }

    /**
     * Throws if the response carries an {@code error} object. A JSON
     * {@code null} value for {@code error} is treated as "no error".
     */
    private void failOnApiError(JsonNode root) throws TikaException {
        JsonNode errorNode = root.get("error");
        if (errorNode == null || errorNode.isNull()) {
            return;
        }
        String msg = errorNode.hasNonNull("message")
                ? errorNode.get("message").asText()
                : errorNode.toString();
        throw new TikaException("Claude API error: " + msg);
    }

    /**
     * Copies the token counts of the {@code usage} object into the metadata.
     * Missing or JSON-null counts are skipped rather than recorded as zero.
     */
    private void recordUsage(JsonNode usage, Metadata metadata) {
        if (usage == null) {
            return;
        }
        if (usage.hasNonNull("input_tokens")) {
            metadata.set(VLM_PROMPT_TOKENS, usage.get("input_tokens").asInt());
        }
        if (usage.hasNonNull("output_tokens")) {
            metadata.set(VLM_COMPLETION_TOKENS, usage.get("output_tokens").asInt());
        }
    }

    /** Joins the text of every {@code text} block in {@code content} with newlines. */
    private String joinTextBlocks(JsonNode content) {
        StringBuilder sb = new StringBuilder();
        for (JsonNode block : content) {
            if (!"text".equals(block.path("type").asText())) {
                continue;
            }
            JsonNode textNode = block.get("text");
            if (textNode == null || textNode.isNull()) {
                continue;
            }
            if (sb.length() > 0) {
                sb.append("\n");
            }
            sb.append(textNode.asText());
        }
        return sb.toString();
    }

    /**
     * @return the unmodifiable set of image and PDF types registered by this class
     */
    @Override
    protected Set<MediaType> getSupportedMediaTypes() {
        return SUPPORTED_TYPES;
    }

    /**
     * @return the configuration key of this parser, {@code "claude-vlm-parser"}
     */
    @Override
    protected String configKey() {
        return "claude-vlm-parser";
    }

    /**
     * Always returns {@code null}: no health-check URL is offered for Claude.
     * Presumably the base class interprets {@code null} as "skip the probe";
     * verify against {@code AbstractVLMParser}.
     */
    @Override
    protected String getHealthCheckUrl(VLMOCRConfig config) {
        // Claude doesn't have a lightweight models endpoint; skip probe
        return null;
    }

    // -- package-visible for tests --

    /**
     * Builds the JSON body of a Messages API request containing one user
     * message with two content blocks: the media (a {@code document} block
     * for PDFs, an {@code image} block otherwise) followed by the prompt text.
     *
     * @param config     supplies model, max tokens and prompt
     * @param base64Data the base64-encoded file content
     * @param mimeType   the MIME type of the content; must not be {@code null}
     * @return the serialized JSON request
     * @throws NullPointerException if {@code mimeType} is {@code null}
     */
    String buildRequestJson(VLMOCRConfig config, String base64Data, String mimeType) {
        String mediaType = toApiMediaType(mimeType);

        ObjectNode root = MAPPER.createObjectNode();
        root.put("model", config.getModel());
        root.put("max_tokens", config.getMaxTokens());

        ArrayNode messages = root.putArray("messages");
        ObjectNode userMessage = messages.addObject();
        userMessage.put("role", "user");

        ArrayNode content = userMessage.putArray("content");

        // Content block: image or document depending on MIME type
        ObjectNode mediaPart = content.addObject();
        mediaPart.put("type", PDF_MIME_TYPE.equals(mediaType) ? "document" : "image");
        ObjectNode source = mediaPart.putObject("source");
        source.put("type", "base64");
        source.put("media_type", mediaType);
        source.put("data", base64Data);

        // Text block: the prompt
        ObjectNode textPart = content.addObject();
        textPart.put("type", "text");
        textPart.put("text", config.getPrompt());

        return root.toString();
    }

    /**
     * Maps a MIME type to the value sent as {@code media_type}.
     * <p>
     * The PDF type is matched case-insensitively and returned in canonical
     * lower case. An {@code image/ocr-*} type, as registered in
     * {@link #SUPPORTED_TYPES}, is reduced to the plain {@code image/*} type.
     * Every other value is passed through unchanged.
     */
    private static String toApiMediaType(String mimeType) {
        if (mimeType == null) {
            throw new NullPointerException("mimeType must not be null");
        }
        if (PDF_MIME_TYPE.equalsIgnoreCase(mimeType)) {
            return PDF_MIME_TYPE;
        }
        int prefixLength = OCR_IMAGE_PREFIX.length();
        if (mimeType.regionMatches(true, 0, OCR_IMAGE_PREFIX, 0, prefixLength)) {
            return "image/" + mimeType.substring(prefixLength);
        }
        return mimeType;
    }
}