// ThemeDataSetGeneratorRandomizationTest.java

/*******************************************************************************
 * Copyright (c) 2025 Eclipse RDF4J contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Distribution License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *******************************************************************************/
// Some portions generated by Codex
package org.eclipse.rdf4j.benchmark.rio.util;

import static org.junit.jupiter.api.Assertions.assertEquals;

import java.util.Random;

import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.util.Values;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.junit.jupiter.api.Test;

/**
 * Checks that {@code ThemeDataSetGenerator.generateMedicalRecords} yields the same model when invoked twice with the
 * same configuration, and that the number of distinct subjects per medical type matches counts obtained by replaying
 * a jitter sequence from a {@link Random} seeded with {@code seed ^ JITTER_SEED_XOR}.
 */
class ThemeDataSetGeneratorRandomizationTest {

	/** Namespace under which the type IRIs checked by this test are built. */
	private static final String BASE = "http://example.com/theme/";

	/**
	 * Mask XOR-ed with the configured seed to seed the replay {@link Random}. NOTE(review): presumably mirrors the
	 * seed derivation inside ThemeDataSetGenerator - confirm against the generator.
	 */
	private static final long JITTER_SEED_XOR = 0x9E3779B97F4A7C15L;

	// Base (un-jittered) sizes, shared by the generator config and the expected-count replay.
	private static final int BASE_PATIENTS = 10;
	private static final int BASE_ENCOUNTERS_PER_PATIENT = 4;
	private static final int BASE_CONDITIONS_PER_ENCOUNTER = 3;
	private static final int BASE_MEDICATIONS_PER_PATIENT = 2;
	private static final int BASE_OBSERVATIONS_PER_ENCOUNTER = 2;
	private static final int BASE_PRACTITIONERS = 6;

	/**
	 * Generates the medical data set twice from one config, requires both models to be equal, then compares the
	 * per-type subject counts with the counts replayed locally.
	 */
	@Test
	void medicalConfigCountsAreJitteredDeterministically() {
		final long seed = 123L;
		ThemeDataSetGenerator.MedicalConfig config = ThemeDataSetGenerator.medicalConfig()
				.withPatientCount(BASE_PATIENTS)
				.withEncountersPerPatient(BASE_ENCOUNTERS_PER_PATIENT)
				.withConditionsPerEncounter(BASE_CONDITIONS_PER_ENCOUNTER)
				.withMedicationsPerPatient(BASE_MEDICATIONS_PER_PATIENT)
				.withObservationsPerEncounter(BASE_OBSERVATIONS_PER_ENCOUNTER)
				.withPractitionerCount(BASE_PRACTITIONERS)
				.withSeed(seed);

		Model generated = ThemeDataSetGenerator.generateMedicalRecords(config);
		Model regenerated = ThemeDataSetGenerator.generateMedicalRecords(config);
		assertEquals(generated, regenerated, "Dataset should be stable for the same seed and config");

		// Replay the jitter draws. The order of the draws below is significant: changing it changes every
		// subsequent value taken from the Random.
		Random replay = new Random(seed ^ JITTER_SEED_XOR);
		int practitionerTotal = jitterInt(replay, BASE_PRACTITIONERS, 1);
		int patientTotal = jitterInt(replay, BASE_PATIENTS, 1);

		long medicationTotal = 0;
		long encounterTotal = 0;
		long conditionTotal = 0;
		long observationTotal = 0;
		for (int patientsLeft = patientTotal; patientsLeft > 0; patientsLeft--) {
			medicationTotal += jitterInt(replay, BASE_MEDICATIONS_PER_PATIENT, 1);
			int encountersForPatient = jitterInt(replay, BASE_ENCOUNTERS_PER_PATIENT, 1);
			encounterTotal += encountersForPatient;
			for (int encountersLeft = encountersForPatient; encountersLeft > 0; encountersLeft--) {
				conditionTotal += jitterInt(replay, BASE_CONDITIONS_PER_ENCOUNTER, 1);
				observationTotal += jitterInt(replay, BASE_OBSERVATIONS_PER_ENCOUNTER, 1);
			}
		}

		assertEquals(patientTotal,
				countType(generated, Values.iri(BASE, "medical/Patient")),
				"Unexpected patient count");
		assertEquals(practitionerTotal,
				countType(generated, Values.iri(BASE, "medical/Practitioner")),
				"Unexpected practitioner count");
		assertEquals(encounterTotal,
				countType(generated, Values.iri(BASE, "medical/Encounter")),
				"Unexpected encounter count");
		assertEquals(conditionTotal,
				countType(generated, Values.iri(BASE, "medical/Condition")),
				"Unexpected condition count");
		assertEquals(medicationTotal,
				countType(generated, Values.iri(BASE, "medical/Medication")),
				"Unexpected medication count");
		assertEquals(observationTotal,
				countType(generated, Values.iri(BASE, "medical/Observation")),
				"Unexpected observation count");
	}

	/**
	 * Counts the distinct subjects that have an {@code rdf:type} statement with the given type.
	 *
	 * @param model the model to inspect
	 * @param type  the class IRI to look for as the object of {@code rdf:type}
	 * @return the number of distinct subjects typed with {@code type}
	 */
	private static long countType(Model model, IRI type) {
		Model typedStatements = model.filter(null, RDF.TYPE, type);
		return typedStatements.subjects().size();
	}

	/**
	 * Picks a value around {@code base}, within half of {@code base} (integer division) on either side, and never
	 * below {@code minValue}.
	 *
	 * @param random   source of randomness; left untouched when the range holds a single value
	 * @param base     centre of the range
	 * @param minValue lower limit applied to the range
	 * @return a value between the lower and upper bound, both inclusive
	 */
	private static int jitterInt(Random random, int base, int minValue) {
		int halfSpan = base / 2;
		int lower = Math.max(minValue, base - halfSpan);
		int upper = Math.max(lower, base + halfSpan);
		// A collapsed range must not consume a draw, otherwise later values would shift.
		return lower == upper ? lower : lower + random.nextInt(upper - lower + 1);
	}
}