// ChunkSerializerTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.inference;

import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;

import java.util.List;

import org.junit.jupiter.api.Test;

import org.apache.tika.inference.locator.Locators;
import org.apache.tika.inference.locator.PaginatedLocator;
import org.apache.tika.inference.locator.SpatialLocator;
import org.apache.tika.inference.locator.TemporalLocator;
import org.apache.tika.inference.locator.TextLocator;

/**
 * Round-trip tests for {@link ChunkSerializer}.
 *
 * <p>Every test builds one or more {@link Chunk} instances, serializes them with
 * {@code ChunkSerializer.toJson}, parses the result back with
 * {@code ChunkSerializer.fromJson}, and then checks that the text, the locators and
 * the vector that were put in are the ones that come back out.
 */
public class ChunkSerializerTest {

    /** Tolerance used when comparing float arrays (vectors and bounding boxes). */
    private static final float DELTA = 1e-6f;

    /**
     * Serializes the given chunks to JSON and immediately parses them back.
     *
     * @param chunks the chunks to push through the serializer
     * @return the chunks produced by parsing the serialized JSON
     * @throws Exception if serialization or parsing fails
     */
    private static List<Chunk> roundTrip(Chunk... chunks) throws Exception {
        return ChunkSerializer.fromJson(ChunkSerializer.toJson(List.of(chunks)));
    }

    /** Two text chunks with vectors keep their text, offsets and vectors, in order. */
    @Test
    void testRoundTripTextLocator() throws Exception {
        Chunk c1 = new Chunk("Hello world", 0, 11);
        c1.setVector(new float[]{0.1f, 0.2f, 0.3f});

        Chunk c2 = new Chunk("Goodbye", 12, 19);
        c2.setVector(new float[]{0.4f, 0.5f, 0.6f});

        List<Chunk> restored = roundTrip(c1, c2);

        assertEquals(2, restored.size());

        Chunk first = restored.get(0);
        assertEquals("Hello world", first.getText());
        assertNotNull(first.getLocators().getText());
        assertEquals(1, first.getLocators().getText().size());
        assertEquals(0, first.getLocators().getText().get(0).getStartOffset());
        assertEquals(11, first.getLocators().getText().get(0).getEndOffset());
        assertArrayEquals(new float[]{0.1f, 0.2f, 0.3f}, first.getVector(), DELTA);

        // The second chunk is checked as thoroughly as the first so that a serializer
        // which only handles the leading element correctly cannot pass.
        Chunk second = restored.get(1);
        assertEquals("Goodbye", second.getText());
        assertNotNull(second.getLocators().getText());
        assertEquals(1, second.getLocators().getText().size());
        assertEquals(12, second.getLocators().getText().get(0).getStartOffset());
        assertEquals(19, second.getLocators().getText().get(0).getEndOffset());
        assertArrayEquals(new float[]{0.4f, 0.5f, 0.6f}, second.getVector(), DELTA);
    }

    /** A chunk spanning two pages keeps both paginated locators, including bboxes. */
    @Test
    void testPaginatedLocator() throws Exception {
        Locators loc = new Locators()
                .addText(new TextLocator(0, 100))
                .addPaginated(new PaginatedLocator(4, new float[]{0.1f, 0.1f, 0.3f, 0.5f}))
                .addPaginated(new PaginatedLocator(5, new float[]{0.8f, 0.1f, 0.9f, 0.5f}));

        List<Chunk> restored = roundTrip(new Chunk("Spanning two pages", loc));

        assertEquals(1, restored.size());
        assertEquals("Spanning two pages", restored.get(0).getText());

        Locators rl = restored.get(0).getLocators();
        assertEquals(1, rl.getText().size());
        assertEquals(0, rl.getText().get(0).getStartOffset());
        assertEquals(100, rl.getText().get(0).getEndOffset());

        assertEquals(2, rl.getPaginated().size());
        assertEquals(4, rl.getPaginated().get(0).getPage());
        assertArrayEquals(new float[]{0.1f, 0.1f, 0.3f, 0.5f},
                rl.getPaginated().get(0).getBbox(), DELTA);
        assertEquals(5, rl.getPaginated().get(1).getPage());
        assertArrayEquals(new float[]{0.8f, 0.1f, 0.9f, 0.5f},
                rl.getPaginated().get(1).getBbox(), DELTA);
    }

    /** Spatial locators survive the round trip with and without a label; text may be null. */
    @Test
    void testSpatialLocator() throws Exception {
        Locators loc = new Locators()
                .addSpatial(new SpatialLocator(
                        new float[]{0.2f, 0.2f, 0.4f, 0.4f}, "leak_point"))
                .addSpatial(new SpatialLocator(
                        new float[]{0.5f, 0.5f, 0.7f, 0.7f}, null));

        List<Chunk> restored = roundTrip(new Chunk(null, loc));

        // Guard the get(0) calls below so an empty result fails as an assertion.
        assertEquals(1, restored.size());
        assertNull(restored.get(0).getText());

        Locators rl = restored.get(0).getLocators();
        assertEquals(2, rl.getSpatial().size());
        assertEquals("leak_point", rl.getSpatial().get(0).getLabel());
        assertNull(rl.getSpatial().get(1).getLabel());
    }

    /** Both temporal ranges keep their start and end times, in insertion order. */
    @Test
    void testTemporalLocator() throws Exception {
        Locators loc = new Locators()
                .addTemporal(new TemporalLocator(12000, 15000))
                .addTemporal(new TemporalLocator(45000, 48000));

        List<Chunk> restored = roundTrip(new Chunk("speech segment", loc));

        assertEquals(1, restored.size());
        assertEquals("speech segment", restored.get(0).getText());

        Locators rl = restored.get(0).getLocators();
        assertEquals(2, rl.getTemporal().size());
        assertEquals(12000, rl.getTemporal().get(0).getStartMs());
        assertEquals(15000, rl.getTemporal().get(0).getEndMs());
        assertEquals(45000, rl.getTemporal().get(1).getStartMs());
        assertEquals(48000, rl.getTemporal().get(1).getEndMs());
    }

    /** A chunk carrying one locator of every kind plus a vector keeps all of them. */
    @Test
    void testAllLocatorTypes() throws Exception {
        Locators loc = new Locators()
                .addText(new TextLocator(0, 500))
                .addPaginated(new PaginatedLocator(3))
                .addSpatial(new SpatialLocator(
                        new float[]{0.1f, 0.2f, 0.3f, 0.4f}, "table"))
                .addTemporal(new TemporalLocator(0, 5000));

        Chunk c = new Chunk("multimodal chunk", loc);
        c.setVector(new float[]{1.0f, 2.0f});

        List<Chunk> restored = roundTrip(c);

        assertEquals(1, restored.size());
        assertEquals("multimodal chunk", restored.get(0).getText());

        Locators rl = restored.get(0).getLocators();
        assertEquals(1, rl.getText().size());
        assertEquals(0, rl.getText().get(0).getStartOffset());
        assertEquals(500, rl.getText().get(0).getEndOffset());

        assertEquals(1, rl.getPaginated().size());
        assertEquals(3, rl.getPaginated().get(0).getPage());

        assertEquals(1, rl.getSpatial().size());
        assertEquals("table", rl.getSpatial().get(0).getLabel());

        assertEquals(1, rl.getTemporal().size());
        assertEquals(0, rl.getTemporal().get(0).getStartMs());
        assertEquals(5000, rl.getTemporal().get(0).getEndMs());

        // Check the values, not just presence, so a truncated or zeroed vector is caught.
        assertNotNull(restored.get(0).getVector());
        assertArrayEquals(new float[]{1.0f, 2.0f}, restored.get(0).getVector(), DELTA);
    }

    /** A page-only locator comes back with its page number and a null bbox. */
    @Test
    void testPaginatedWithoutBbox() throws Exception {
        Locators loc = new Locators()
                .addPaginated(new PaginatedLocator(7));

        List<Chunk> restored = roundTrip(new Chunk("whole page", loc));

        assertEquals(1, restored.size());
        assertEquals("whole page", restored.get(0).getText());

        Locators rl = restored.get(0).getLocators();
        assertEquals(1, rl.getPaginated().size());
        assertEquals(7, rl.getPaginated().get(0).getPage());
        assertNull(rl.getPaginated().get(0).getBbox());
    }

    /** A chunk that never had a vector set comes back with a null vector. */
    @Test
    void testWithoutVector() throws Exception {
        List<Chunk> restored = roundTrip(new Chunk("No vector", 0, 9));

        assertEquals(1, restored.size());
        assertEquals("No vector", restored.get(0).getText());
        assertEquals(1, restored.get(0).getLocators().getText().size());
        assertEquals(0, restored.get(0).getLocators().getText().get(0).getStartOffset());
        assertEquals(9, restored.get(0).getLocators().getText().get(0).getEndOffset());
        assertNull(restored.get(0).getVector());
    }

    /** An empty list serializes to exactly {@code []} and parses back to an empty list. */
    @Test
    void testEmptyList() throws Exception {
        String json = ChunkSerializer.toJson(List.of());
        assertEquals("[]", json);
        assertEquals(0, ChunkSerializer.fromJson(json).size());
    }

    /** Quotes, ampersands and angle brackets in the text are preserved verbatim. */
    @Test
    void testSpecialCharacters() throws Exception {
        Chunk c = new Chunk("He said \"hello\" & <goodbye>", 0, 27);
        c.setVector(new float[]{1.0f});

        List<Chunk> restored = roundTrip(c);

        assertEquals(1, restored.size());
        assertEquals("He said \"hello\" & <goodbye>",
                restored.get(0).getText());
        assertArrayEquals(new float[]{1.0f}, restored.get(0).getVector(), DELTA);
    }
}