CorruptIRIOrBNode.java
/*******************************************************************************
* Copyright (c) 2024 Eclipse RDF4J contributors.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/
package org.eclipse.rdf4j.sail.nativerdf.model;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.commons.codec.binary.Hex;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.sail.nativerdf.NativeStore;
import org.eclipse.rdf4j.sail.nativerdf.ValueStoreRevision;
import com.google.common.net.UrlEscapers;
/**
 * CorruptIRIOrBNode is used when a NativeValue cannot be read from the ValueStore and soft failure is enabled.
 * <p>
 * The instance implements both {@link IRI} and {@link BNode}. It reports a fixed namespace and derives a
 * human-readable local name from whatever raw bytes are available, so that the corrupt entry can still be identified.
 *
 * @see NativeStore#SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES
 *
 * @author Håvard M. Ottestad
 */
public class CorruptIRIOrBNode extends CorruptValue implements IRI, BNode {

	private static final long serialVersionUID = 3709784393454516043L;

	/** Character the UTF-8 decoder substitutes for malformed input; its presence marks an unclean decode. */
	private static final char REPLACEMENT_CHAR = '\uFFFD';

	/** Lowest printable US-ASCII byte (space). */
	private static final int PRINTABLE_ASCII_MIN = 0x20;

	/** Highest printable US-ASCII byte (~). */
	private static final int PRINTABLE_ASCII_MAX = 0x7E;

	/**
	 * Creates a placeholder for a value whose stored bytes could not be interpreted.
	 *
	 * @param revision   the value store revision, passed through to {@link CorruptValue}
	 * @param internalID the internal ID of the value, passed through to {@link CorruptValue}
	 * @param data       the raw bytes that were read (may be {@code null} or empty), passed through to
	 *                   {@link CorruptValue}
	 */
	public CorruptIRIOrBNode(ValueStoreRevision revision, int internalID, byte[] data) {
		super(revision, internalID, data);
	}

	/**
	 * @return the same text as {@link #stringValue()}
	 */
	@Override
	public String toString() {
		return stringValue();
	}

	/**
	 * Builds the string form from {@link #getNamespace()}, a colon and {@link #getLocalName()}. If anything goes wrong
	 * while computing that, a fixed label containing only the internal ID is returned instead.
	 *
	 * @return a best-effort string representation; never throws
	 */
	@Override
	public String stringValue() {
		try {
			// NOTE(review): getNamespace() already ends with ':' so the result contains "::" and is not simply
			// namespace + localName. Kept unchanged because this text is runtime output - confirm it is intended.
			return getNamespace() + ":" + getLocalName();
		} catch (Throwable ignored) {
			// Deliberate best effort: this object represents corrupt data and rendering it must not fail,
			// so fall through to the ID-only label below.
		}
		return "CorruptIRIOrBNode_with_ID_" + getInternalID();
	}

	/**
	 * @return the type as reported by the {@link IRI} default implementation; the explicit choice is needed because
	 *         this class implements both {@link IRI} and {@link BNode}
	 */
	@Override
	public Type getType() {
		return IRI.super.getType();
	}

	/**
	 * @return the constant namespace shared by all corrupt IRI/BNode placeholders
	 */
	@Override
	public String getNamespace() {
		return "urn:CorruptIRIOrBNode:";
	}

	/**
	 * Derives a descriptive local name from the raw data. The result always starts with {@code CORRUPT_ID_} followed
	 * by the internal ID. What follows depends on the data, trying in order:
	 * <ol>
	 * <li>no data: nothing further is appended;</li>
	 * <li>all bytes are zero: a note stating how many zero bytes there were;</li>
	 * <li>the (truncated) data decodes cleanly as UTF-8: the escaped text;</li>
	 * <li>the longest slice that decodes cleanly as UTF-8: the escaped text;</li>
	 * <li>the longest run of printable US-ASCII bytes: the escaped text;</li>
	 * <li>otherwise: the hex encoding of the (truncated) data.</li>
	 * </ol>
	 * Text that is empty after {@link String#trim()} is skipped in favour of the next strategy.
	 *
	 * @return the derived local name, never {@code null}
	 */
	@Override
	public String getLocalName() {
		String prefix = "CORRUPT_ID_" + getInternalID();

		byte[] data = getData();
		if (data == null || data.length == 0) {
			return prefix;
		}

		// The all-zero check intentionally looks at the full, untruncated data so the reported length is accurate.
		if (containsOnlyZeroBytes(data)) {
			return prefix + "_all_" + data.length + "_data_bytes_are_0x00";
		}

		// truncateData is defined outside this class; presumably it bounds how much data is inspected - TODO confirm.
		data = truncateData(data);

		String text = decodeWholeAsCleanUtf8(data);
		if (text == null) {
			text = recoverLongestCleanUtf8Slice(data);
		}
		if (text == null) {
			text = recoverLongestPrintableAsciiRun(data);
		}
		if (text != null) {
			return prefix + "_" + UrlEscapers.urlPathSegmentEscaper().escape(text);
		}

		// Last resort: nothing readable was found, so expose the raw bytes as hex.
		return prefix + "_HEX_" + Hex.encodeHexString(data);
	}

	/**
	 * @return always the empty string
	 */
	@Override
	public String getID() {
		return "";
	}

	/**
	 * Two corrupt values are equal when both have a known internal ID, come from equal value store revisions and share
	 * the same internal ID. In every other case the decision is delegated to the superclass.
	 *
	 * @param o the object to compare with
	 * @return {@code true} if the objects are considered equal
	 */
	@Override
	public boolean equals(Object o) {
		if (this == o) {
			return true;
		}

		if (o instanceof CorruptIRIOrBNode && getInternalID() != NativeValue.UNKNOWN_ID) {
			CorruptIRIOrBNode otherCorruptValue = (CorruptIRIOrBNode) o;

			if (otherCorruptValue.getInternalID() != NativeValue.UNKNOWN_ID
					&& getValueStoreRevision().equals(otherCorruptValue.getValueStoreRevision())) {
				// CorruptValue is from the same revision of the same native store with both IDs set
				return getInternalID() == otherCorruptValue.getInternalID();
			}
		}

		// NOTE(review): hashCode() is not overridden in this class; presumably the inherited implementation is
		// consistent with this equals - confirm against CorruptValue.
		return super.equals(o);
	}

	/** Returns {@code true} when every byte of {@code data} is 0x00. */
	private static boolean containsOnlyZeroBytes(byte[] data) {
		for (byte b : data) {
			if (b != 0) {
				return false;
			}
		}
		return true;
	}

	/** Returns {@code true} when {@code b} lies in the printable US-ASCII range 0x20 (space) to 0x7E (~). */
	private static boolean isPrintableAsciiByte(byte b) {
		// bytes are signed, so values >= 0x80 are negative and fail the lower bound
		return b >= PRINTABLE_ASCII_MIN && b <= PRINTABLE_ASCII_MAX;
	}

	/** Returns {@code text} unless it is empty after trimming, in which case {@code null} is returned. */
	private static String nullIfBlankAfterTrim(String text) {
		return text.trim().isEmpty() ? null : text;
	}

	/**
	 * Decodes all of {@code data} as UTF-8. Returns the text only if no replacement character was produced and the
	 * text is not blank; otherwise {@code null}.
	 */
	private static String decodeWholeAsCleanUtf8(byte[] data) {
		// new String(byte[], Charset) never throws on malformed input; it substitutes U+FFFD instead.
		String decoded = new String(data, StandardCharsets.UTF_8);
		if (decoded.indexOf(REPLACEMENT_CHAR) >= 0) {
			return null;
		}
		return nullIfBlankAfterTrim(decoded);
	}

	/**
	 * Finds the longest contiguous slice of {@code data} that decodes as UTF-8 without a replacement character (the
	 * earliest such slice wins on ties). Returns its text, or {@code null} if there is none or it is blank.
	 */
	private static String recoverLongestCleanUtf8Slice(byte[] data) {
		String best = null;
		int bestByteLen = 0;

		// A slice starting at 'start' is at most data.length - start bytes long, so once that cannot exceed the
		// current best no later start can either.
		for (int start = 0; data.length - start > bestByteLen; start++) {
			// Try the longest slice first and shrink; stop as soon as the slice could not beat the best.
			for (int len = data.length - start; len > bestByteLen; len--) {
				String candidate = new String(data, start, len, StandardCharsets.UTF_8);
				if (candidate.indexOf(REPLACEMENT_CHAR) < 0) {
					best = candidate;
					bestByteLen = len;
					break; // shorter slices from this start cannot be better
				}
			}
		}

		return best == null ? null : nullIfBlankAfterTrim(best);
	}

	/**
	 * Finds the longest contiguous run of printable US-ASCII bytes in {@code data} (the earliest run wins on ties).
	 * Returns its text, or {@code null} if there is no such run or it is blank.
	 */
	private static String recoverLongestPrintableAsciiRun(byte[] data) {
		int bestStart = -1;
		int bestLen = 0;

		int pos = 0;
		while (pos < data.length) {
			if (!isPrintableAsciiByte(data[pos])) {
				pos++;
				continue;
			}

			int runStart = pos;
			while (pos < data.length && isPrintableAsciiByte(data[pos])) {
				pos++;
			}

			int runLen = pos - runStart;
			if (runLen > bestLen) {
				bestLen = runLen;
				bestStart = runStart;
			}
		}

		if (bestLen == 0) {
			return null;
		}
		return nullIfBlankAfterTrim(new String(data, bestStart, bestLen, StandardCharsets.US_ASCII));
	}
}