// CorruptUnknownValueTest.java

/*******************************************************************************
 * Copyright (c) 2025 Eclipse RDF4J contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Distribution License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *******************************************************************************/
package org.eclipse.rdf4j.sail.nativerdf.model;

import static org.junit.jupiter.api.Assertions.assertTrue;

import java.nio.charset.StandardCharsets;

import org.apache.commons.codec.binary.Hex;
import org.junit.jupiter.api.Test;

/**
 * Tests for the label returned by {@link CorruptUnknownValue#getLabel()} when the value is built from damaged or
 * partially decodable bytes.
 */
public class CorruptUnknownValueTest {

	/** Text every label is expected to start with for the ID (456) used by these tests. */
	private static final String LABEL_PREFIX = "CorruptUnknownValue with ID 456 with possible data: ";

	/**
	 * Creates the value under test with {@code null} as the first constructor argument, ID 456 and the given bytes.
	 *
	 * @param data raw bytes handed to the value
	 * @return a new {@link CorruptUnknownValue} wrapping {@code data}
	 */
	private static CorruptUnknownValue valueWithData(byte[] data) {
		return new CorruptUnknownValue(null, 456, data);
	}

	/**
	 * Joins the given byte arrays, in argument order, into a single newly allocated array.
	 *
	 * @param chunks arrays to join
	 * @return the concatenation of all {@code chunks}
	 */
	private static byte[] concat(byte[]... chunks) {
		int total = 0;
		for (byte[] chunk : chunks) {
			total += chunk.length;
		}

		byte[] joined = new byte[total];
		int offset = 0;
		for (byte[] chunk : chunks) {
			System.arraycopy(chunk, 0, joined, offset, chunk.length);
			offset += chunk.length;
		}
		return joined;
	}

	/**
	 * Input layout: invalid pair, "validlong", invalid pair, "abc". The label must carry the prefix and contain the
	 * longer readable run.
	 */
	@Test
	public void recoversLongestValidUtf8Substring() {
		byte[] brokenHead = { (byte) 0xC3, (byte) 0x28 }; // invalid UTF-8
		byte[] longText = "validlong".getBytes(StandardCharsets.UTF_8);
		byte[] brokenMiddle = { (byte) 0xC0, (byte) 0xAF }; // invalid UTF-8
		byte[] shortText = "abc".getBytes(StandardCharsets.UTF_8);

		String label = valueWithData(concat(brokenHead, longText, brokenMiddle, shortText)).getLabel();

		assertTrue(label.startsWith(LABEL_PREFIX));
		assertTrue(label.contains("validlong"), "Should recover the core decodable segment");
	}

	/**
	 * Four bytes with no readable text between them: the label must carry the prefix and contain the hex encoding of
	 * the whole input.
	 */
	@Test
	public void fallsBackToHexWhenNoDecodableSubstring() {
		byte[] undecodable = { (byte) 0x80, (byte) 0x81, (byte) 0xFE, (byte) 0xFF };

		String label = valueWithData(undecodable).getLabel();
		String hex = Hex.encodeHexString(undecodable);

		assertTrue(label.startsWith(LABEL_PREFIX));
		assertTrue(label.contains(hex), "Should fall back to hex encoding when undecodable");
	}

	/**
	 * Input layout: "xyz", three zero bytes, "end". The label must contain the text before the zero run and must not
	 * contain the text after it.
	 */
	@Test
	public void stopsParsingAtTripleZeroSentinel() {
		byte[] beforeSentinel = "xyz".getBytes(StandardCharsets.UTF_8);
		byte[] tripleZero = { 0, 0, 0 };
		byte[] afterSentinel = "end".getBytes(StandardCharsets.UTF_8);

		String label = valueWithData(concat(beforeSentinel, tripleZero, afterSentinel)).getLabel();

		assertTrue(label.startsWith(LABEL_PREFIX));
		assertTrue(label.contains("xyz"), "Should use data before sentinel");
		assertTrue(!label.contains("end"), "Should not parse past sentinel");
	}
}