CorruptIRIOrBNodeTest.java

/*******************************************************************************
 * Copyright (c) 2025 Eclipse RDF4J contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Distribution License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *******************************************************************************/
package org.eclipse.rdf4j.sail.nativerdf.model;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.nio.charset.StandardCharsets;

import org.apache.commons.codec.binary.Hex;
import org.junit.jupiter.api.Test;

/**
 * Unit tests for {@link CorruptIRIOrBNode#getLocalName()} recovery behavior.
 */
public class CorruptIRIOrBNodeTest {

	private static CorruptIRIOrBNode nodeWithData(byte[] data) {
		return new CorruptIRIOrBNode(null, 123, data);
	}

	@Test
	public void recoversLongestValidUtf8Substring() {
		// Prepare a byte array with 5-byte header followed by: invalid, valid ASCII/UTF-8, invalid, short valid
		byte[] header = new byte[] { 0, 0, 0, 0, 0 };
		byte[] invalid1 = new byte[] { (byte) 0xC3, (byte) 0x28 }; // invalid UTF-8 sequence
		byte[] validLong = "validlong".getBytes(StandardCharsets.UTF_8);
		byte[] invalid2 = new byte[] { (byte) 0xC0, (byte) 0xAF }; // invalid UTF-8 sequence
		byte[] validShort = "abc".getBytes(StandardCharsets.UTF_8);

		byte[] data = new byte[header.length + invalid1.length + validLong.length + invalid2.length
				+ validShort.length];
		int pos = 0;
		System.arraycopy(header, 0, data, pos, header.length);
		pos += header.length;
		System.arraycopy(invalid1, 0, data, pos, invalid1.length);
		pos += invalid1.length;
		System.arraycopy(validLong, 0, data, pos, validLong.length);
		pos += validLong.length;
		System.arraycopy(invalid2, 0, data, pos, invalid2.length);
		pos += invalid2.length;
		System.arraycopy(validShort, 0, data, pos, validShort.length);

		CorruptIRIOrBNode node = nodeWithData(data);
		String localName = node.getLocalName();

		// Expect a valid decodable segment to be chosen containing the core text
		assertTrue(localName.startsWith("CORRUPT_"), "Should be prefixed with CORRUPT_");
		assertTrue(localName.contains("validlong"), "Should recover the core decodable segment");
	}

	@Test
	public void fallsBackToHexWhenNoDecodableSubstring() {
		// Prepare a byte array with 5-byte header followed by bytes with no ASCII/UTF-8 decodable sequences
		byte[] header = new byte[] { 0, 0, 0, 0, 0 };
		byte[] body = new byte[] { (byte) 0x80, (byte) 0x81, (byte) 0xFE, (byte) 0xFF };

		byte[] data = new byte[header.length + body.length];
		System.arraycopy(header, 0, data, 0, header.length);
		System.arraycopy(body, 0, data, header.length, body.length);

		CorruptIRIOrBNode node = nodeWithData(data);
		String expectedHex = Hex.encodeHexString(stripLeavingZeros(data));

		String localName = node.getLocalName();
		assertTrue(localName.startsWith("CORRUPT_"), "Should be prefixed with CORRUPT_");
		assertEquals("CORRUPT_ID_" + node.getInternalID() + "_HEX_" + expectedHex, localName);
	}

	private byte[] stripLeavingZeros(byte[] data) {
		int firstNonZero = 0;
		for (int i = 0; i < data.length; i++) {
			if (data[i] != 0) {
				firstNonZero = i;
				break;
			}
		}
		byte[] stripped = new byte[data.length - firstNonZero];
		System.arraycopy(data, firstNonZero, stripped, 0, stripped.length);
		return stripped;
	}

	@Test
	public void stopsParsingAtTripleZeroSentinel() {
		byte[] header = new byte[] { 0, 0, 0, 0, 0 };
		byte[] valid = "abc".getBytes(StandardCharsets.UTF_8);
		byte[] sentinel = new byte[] { 0, 0, 0 };
		byte[] tail = "tail".getBytes(StandardCharsets.UTF_8);

		byte[] data = new byte[header.length + valid.length + sentinel.length + tail.length];
		int pos = 0;
		System.arraycopy(header, 0, data, pos, header.length);
		pos += header.length;
		System.arraycopy(valid, 0, data, pos, valid.length);
		pos += valid.length;
		System.arraycopy(sentinel, 0, data, pos, sentinel.length);
		pos += sentinel.length;
		System.arraycopy(tail, 0, data, pos, tail.length);

		CorruptIRIOrBNode node = nodeWithData(data);
		String localName = node.getLocalName();

		assertTrue(localName.startsWith("CORRUPT_"));
		assertTrue(localName.contains("abc"), "Should recover text before sentinel");
		assertTrue(!localName.contains("tail"), "Should not parse past sentinel");
	}

	@Test
	public void ignoresLeadingZerosBeforeSentinel() {
		byte[] header = new byte[] { 0, 0, 0, 0, 0 };
		byte[] leadingZeros = new byte[] { 0, 0, 0, 0, 0, 0 };
		byte[] valid = "abc".getBytes(StandardCharsets.UTF_8);
		byte[] sentinel = new byte[] { 0, 0, 0 };
		byte[] tail = "tail".getBytes(StandardCharsets.UTF_8);

		byte[] data = new byte[header.length + leadingZeros.length + valid.length + sentinel.length + tail.length];
		int pos = 0;
		System.arraycopy(header, 0, data, pos, header.length);
		pos += header.length;
		System.arraycopy(leadingZeros, 0, data, pos, leadingZeros.length);
		pos += leadingZeros.length;
		System.arraycopy(valid, 0, data, pos, valid.length);
		pos += valid.length;
		System.arraycopy(sentinel, 0, data, pos, sentinel.length);
		pos += sentinel.length;
		System.arraycopy(tail, 0, data, pos, tail.length);

		CorruptIRIOrBNode node = nodeWithData(data);
		String localName = node.getLocalName();

		assertTrue(localName.startsWith("CORRUPT_"));
		assertTrue(localName.contains("abc"), "Should recover data after leading zeros");
		assertTrue(!localName.contains("tail"), "Should stop at sentinel after non-zero encountered");
	}
}