ChunkSerializer.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.inference;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;

import org.apache.tika.inference.locator.Locators;
import org.apache.tika.inference.locator.PaginatedLocator;
import org.apache.tika.inference.locator.SpatialLocator;
import org.apache.tika.inference.locator.TemporalLocator;
import org.apache.tika.inference.locator.TextLocator;
import org.apache.tika.metadata.TikaCoreProperties;

/**
 * Serializes and deserializes a list of {@link Chunk} objects to/from JSON.
 * Vectors are stored as base64-encoded little-endian float32 via
 * {@link VectorSerializer}. Locators are nested under a {@code "locators"}
 * object with optional {@code text}, {@code paginated}, {@code spatial},
 * and {@code temporal} arrays.
 */
public final class ChunkSerializer {

    private static final ObjectMapper MAPPER = new ObjectMapper();

    private ChunkSerializer() {
    }

    /**
     * Serialize chunks to a JSON array string.
     */
    public static String toJson(List<Chunk> chunks) throws IOException {
        ArrayNode array = MAPPER.createArrayNode();
        for (Chunk chunk : chunks) {
            ObjectNode node = array.addObject();
            if (chunk.getText() != null) {
                node.put("text", chunk.getText());
            }
            if (chunk.getVector() != null) {
                node.put("vector", VectorSerializer.encode(chunk.getVector()));
            }
            serializeLocators(node, chunk.getLocators());
        }
        return MAPPER.writeValueAsString(array);
    }

    /**
     * Reads any existing chunks from the metadata field, appends the
     * new chunks, and writes the merged list back. This allows
     * multiple components (text chunker, image embedder, etc.) to
     * contribute to the same chunks array.
     *
     * @param metadata  the metadata to read from and write to
     * @param newChunks chunks to append
     */
    public static void mergeInto(
            org.apache.tika.metadata.Metadata metadata,
            List<Chunk> newChunks) throws IOException {
        List<Chunk> existing;
        String current = metadata.get(TikaCoreProperties.TIKA_CHUNKS);
        if (current != null && !current.isEmpty()) {
            existing = fromJson(current);
        } else {
            existing = new ArrayList<>();
        }
        existing.addAll(newChunks);
        metadata.set(TikaCoreProperties.TIKA_CHUNKS, toJson(existing));
    }

    /**
     * Deserialize a JSON array string back to a list of chunks.
     */
    public static List<Chunk> fromJson(String json) throws IOException {
        JsonNode array = MAPPER.readTree(json);
        List<Chunk> chunks = new ArrayList<>();
        for (JsonNode node : array) {
            String text = node.has("text") ? node.get("text").asText() : null;
            Locators locators = deserializeLocators(node.get("locators"));

            Chunk chunk = new Chunk(text, locators);

            JsonNode vectorNode = node.get("vector");
            if (vectorNode != null && !vectorNode.isNull()) {
                chunk.setVector(VectorSerializer.decode(vectorNode.asText()));
            }
            chunks.add(chunk);
        }
        return chunks;
    }

    // ---- locator serialization --------------------------------------------

    private static void serializeLocators(ObjectNode parent, Locators loc) {
        if (loc == null || loc.isEmpty()) {
            return;
        }

        ObjectNode locNode = parent.putObject("locators");

        if (loc.getText() != null && !loc.getText().isEmpty()) {
            ArrayNode arr = locNode.putArray("text");
            for (TextLocator t : loc.getText()) {
                ObjectNode o = arr.addObject();
                o.put("start_offset", t.getStartOffset());
                o.put("end_offset", t.getEndOffset());
            }
        }

        if (loc.getPaginated() != null && !loc.getPaginated().isEmpty()) {
            ArrayNode arr = locNode.putArray("paginated");
            for (PaginatedLocator p : loc.getPaginated()) {
                ObjectNode o = arr.addObject();
                o.put("page", p.getPage());
                if (p.getBbox() != null) {
                    ArrayNode bboxArr = o.putArray("bbox");
                    for (float v : p.getBbox()) {
                        bboxArr.add(v);
                    }
                }
            }
        }

        if (loc.getSpatial() != null && !loc.getSpatial().isEmpty()) {
            ArrayNode arr = locNode.putArray("spatial");
            for (SpatialLocator s : loc.getSpatial()) {
                ObjectNode o = arr.addObject();
                if (s.getBbox() != null) {
                    ArrayNode bboxArr = o.putArray("bbox");
                    for (float v : s.getBbox()) {
                        bboxArr.add(v);
                    }
                }
                if (s.getLabel() != null) {
                    o.put("label", s.getLabel());
                }
            }
        }

        if (loc.getTemporal() != null && !loc.getTemporal().isEmpty()) {
            ArrayNode arr = locNode.putArray("temporal");
            for (TemporalLocator t : loc.getTemporal()) {
                ObjectNode o = arr.addObject();
                o.put("start_ms", t.getStartMs());
                o.put("end_ms", t.getEndMs());
            }
        }
    }

    private static Locators deserializeLocators(JsonNode locNode) {
        Locators locators = new Locators();
        if (locNode == null || locNode.isNull()) {
            return locators;
        }

        JsonNode textArr = locNode.get("text");
        if (textArr != null && textArr.isArray()) {
            for (JsonNode n : textArr) {
                locators.addText(new TextLocator(
                        n.get("start_offset").asInt(),
                        n.get("end_offset").asInt()));
            }
        }

        JsonNode pagArr = locNode.get("paginated");
        if (pagArr != null && pagArr.isArray()) {
            for (JsonNode n : pagArr) {
                float[] bbox = deserializeBbox(n.get("bbox"));
                locators.addPaginated(
                        new PaginatedLocator(n.get("page").asInt(), bbox));
            }
        }

        JsonNode spatArr = locNode.get("spatial");
        if (spatArr != null && spatArr.isArray()) {
            for (JsonNode n : spatArr) {
                float[] bbox = deserializeBbox(n.get("bbox"));
                String label = n.has("label") ? n.get("label").asText() : null;
                locators.addSpatial(new SpatialLocator(bbox, label));
            }
        }

        JsonNode tempArr = locNode.get("temporal");
        if (tempArr != null && tempArr.isArray()) {
            for (JsonNode n : tempArr) {
                locators.addTemporal(new TemporalLocator(
                        n.get("start_ms").asLong(),
                        n.get("end_ms").asLong()));
            }
        }

        return locators;
    }

    private static float[] deserializeBbox(JsonNode bboxNode) {
        if (bboxNode == null || !bboxNode.isArray()) {
            return null;
        }
        float[] bbox = new float[bboxNode.size()];
        for (int i = 0; i < bboxNode.size(); i++) {
            bbox[i] = (float) bboxNode.get(i).asDouble();
        }
        return bbox;
    }
}