VarNameNormalizer.java

/*******************************************************************************
 * Copyright (c) 2025 Eclipse RDF4J contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Distribution License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 ******************************************************************************/
package org.eclipse.rdf4j.queryrender;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
 * Normalizes anonymous variable tokens so structurally identical trees compare equal even if hashed suffixes differ.
 * Standalone identifiers only (left boundary must be a non-word char). Word chars = [A-Za-z0-9_].
 *
 * Families are prefixes (including trailing underscore), e.g. "_anon_path_". Pre-numbered tails (digits-only) are
 * preserved and reserve their numbers.
 */
public final class VarNameNormalizer {

	private static final List<String> DEFAULT_PREFIXES = Arrays.asList(
			"_anon_collection_",
			"_anon_path_inverse_",
			"_anon_path_",
			"_anon_having_",
			"_anon_"
	);

	private VarNameNormalizer() {
	}

	public static String normalizeVars(String input) {
		return normalizeVars(input, DEFAULT_PREFIXES);
	}

	public static String normalizeVars(String input, List<String> families) {
		if (input == null || input.isEmpty()) {
			return input;
		}

		// Longest-first so more specific families win (e.g., path_inverse before path).
		List<String> fams = new ArrayList<>(families);
		fams.sort((a, b) -> Integer.compare(b.length(), a.length()));

		// Reserve numbers per family with BitSet for O(1) next-id.
		final Map<String, BitSet> reserved = new HashMap<>();
		for (String f : fams) {
			reserved.put(f, new BitSet());
		}

		// If there is a shared underscore-terminated prefix (e.g., "_anon_"), use the fast path.
		final String shared = sharedPrefixEndingWithUnderscore(fams);

		if (!shared.isEmpty()) {
			reservePreNumberedFast(input, fams, reserved, shared);
			return rewriteHashedFast(input, fams, reserved, shared);
		}

		// Generic path: bucket by first char; still no regionMatches.
		final Map<Character, List<String>> byFirst = bucketByFirstChar(fams);
		reservePreNumberedGeneric(input, byFirst, reserved);
		return rewriteHashedGeneric(input, byFirst, reserved);
	}

	/* ============================ Fast path (shared prefix) ============================ */

	private static void reservePreNumberedFast(String s, List<String> fams, Map<String, BitSet> reserved,
			String shared) {
		final int n = s.length();
		int i = s.indexOf(shared, 0);
		while (i >= 0) {
			if ((i == 0 || !isWordChar(s.charAt(i - 1)))) {
				String family = matchFamilyAt(s, i, fams);
				if (family != null) {
					final int tailStart = i + family.length();
					if (tailStart < n && isWordChar(s.charAt(tailStart))) {
						int j = tailStart + 1;
						while (j < n && isWordChar(s.charAt(j))) {
							j++;
						}
						int num = parsePositiveIntOrMinusOne(s, tailStart, j);
						if (num >= 0) {
							reserved.get(family).set(num);
						}
					}
				}
			}
			i = s.indexOf(shared, i + 1);
		}
	}

	private static String rewriteHashedFast(String s, List<String> fams, Map<String, BitSet> reserved, String shared) {
		final int n = s.length();
		final StringBuilder out = new StringBuilder(n + 16);
		final Map<String, String> mapping = new LinkedHashMap<>();

		int writePos = 0;
		int i = s.indexOf(shared, 0);
		while (i >= 0) {
			if (!(i == 0 || !isWordChar(s.charAt(i - 1)))) {
				i = s.indexOf(shared, i + 1);
				continue;
			}

			String family = matchFamilyAt(s, i, fams);
			if (family == null) {
				i = s.indexOf(shared, i + 1);
				continue;
			}

			final int tailStart = i + family.length();
			if (tailStart >= n || !isWordChar(s.charAt(tailStart))) {
				i = s.indexOf(shared, i + 1);
				continue;
			}

			int j = tailStart + 1;
			while (j < n && isWordChar(s.charAt(j))) {
				j++;
			}

			if (isAllDigits(s, tailStart, j)) {
				// keep as-is
				out.append(s, writePos, j);
				writePos = j;
			} else {
				String original = s.substring(i, j); // small, acceptable allocation
				String replacement = mapping.get(original);
				if (replacement == null) {
					BitSet bs = reserved.get(family);
					int next = bs.nextClearBit(1);
					bs.set(next);
					replacement = family + next;
					mapping.put(original, replacement);
				}
				out.append(s, writePos, i).append(replacement);
				writePos = j;
			}

			i = s.indexOf(shared, j);
		}
		out.append(s, writePos, n);
		return out.toString();
	}

	/**
	 * Find the specific family that matches at offset i. fams must be sorted longest-first. No regionMatches; inline
	 * char checks.
	 */
	private static String matchFamilyAt(String s, int i, List<String> fams) {
		final int n = s.length();
		for (String f : fams) {
			int len = f.length();
			if (i + len > n) {
				continue;
			}
			// manual "startsWithAt"
			boolean ok = true;
			for (int k = 0; k < len; k++) {
				if (s.charAt(i + k) != f.charAt(k)) {
					ok = false;
					break;
				}
			}
			if (ok) {
				return f;
			}
		}
		return null;
	}

	/* ============================ Generic path (no common prefix) ============================ */

	private static void reservePreNumberedGeneric(String s, Map<Character, List<String>> byFirst,
			Map<String, BitSet> reserved) {
		final int n = s.length();
		for (int i = 0; i < n;) {
			char c = s.charAt(i);
			if (!(i == 0 || !isWordChar(s.charAt(i - 1)))) {
				i++;
				continue;
			}
			List<String> cand = byFirst.get(c);
			if (cand == null) {
				i++;
				continue;
			}

			String family = matchFamilyAtFromBucket(s, i, cand);
			if (family == null) {
				i++;
				continue;
			}

			int tailStart = i + family.length();
			if (tailStart >= n || !isWordChar(s.charAt(tailStart))) {
				i++;
				continue;
			}

			int j = tailStart + 1;
			while (j < n && isWordChar(s.charAt(j))) {
				j++;
			}

			int num = parsePositiveIntOrMinusOne(s, tailStart, j);
			if (num >= 0) {
				reserved.get(family).set(num);
			}

			i = j; // jump past the token
		}
	}

	private static String rewriteHashedGeneric(String s, Map<Character, List<String>> byFirst,
			Map<String, BitSet> reserved) {
		final int n = s.length();
		final StringBuilder out = new StringBuilder(n + 16);
		final Map<String, String> mapping = new LinkedHashMap<>();

		int writePos = 0;
		for (int i = 0; i < n;) {
			char c = s.charAt(i);
			if (!(i == 0 || !isWordChar(s.charAt(i - 1)))) {
				i++;
				continue;
			}
			List<String> cand = byFirst.get(c);
			if (cand == null) {
				i++;
				continue;
			}

			String family = matchFamilyAtFromBucket(s, i, cand);
			if (family == null) {
				i++;
				continue;
			}

			int tailStart = i + family.length();
			if (tailStart >= n || !isWordChar(s.charAt(tailStart))) {
				i++;
				continue;
			}

			int j = tailStart + 1;
			while (j < n && isWordChar(s.charAt(j))) {
				j++;
			}

			if (isAllDigits(s, tailStart, j)) {
				// keep as-is
				out.append(s, writePos, j);
				writePos = j;
			} else {
				String original = s.substring(i, j); // small, acceptable allocation
				String replacement = mapping.get(original);
				if (replacement == null) {
					BitSet bs = reserved.get(family);
					int next = bs.nextClearBit(1);
					bs.set(next);
					replacement = family + next;
					mapping.put(original, replacement);
				}
				out.append(s, writePos, i).append(replacement);
				writePos = j;
			}

			i = j;
		}
		out.append(s, writePos, n);
		return out.toString();
	}

	private static Map<Character, List<String>> bucketByFirstChar(List<String> fams) {
		final Map<Character, List<String>> byFirst = new HashMap<>();
		for (String f : fams) {
			char c = f.charAt(0);
			byFirst.computeIfAbsent(c, k -> new ArrayList<>()).add(f);
		}
		return byFirst;
	}

	private static String matchFamilyAtFromBucket(String s, int i, List<String> fams) {
		final int n = s.length();
		for (String f : fams) {
			int len = f.length();
			if (i + len > n) {
				continue;
			}
			boolean ok = true;
			for (int k = 0; k < len; k++) {
				if (s.charAt(i + k) != f.charAt(k)) {
					ok = false;
					break;
				}
			}
			if (ok) {
				return f;
			}
		}
		return null;
	}

	/* =============================== Utilities =============================== */

	private static String sharedPrefixEndingWithUnderscore(List<String> fams) {
		if (fams.isEmpty()) {
			return "";
		}
		char[] acc = fams.get(0).toCharArray();
		int end = acc.length;
		for (int i = 1; i < fams.size(); i++) {
			String f = fams.get(i);
			end = Math.min(end, f.length());
			for (int k = 0; k < end; k++) {
				if (acc[k] != f.charAt(k)) {
					end = k;
					break;
				}
			}
		}
		while (end > 0 && acc[end - 1] != '_') {
			end--;
		}
		if (end == 0) {
			return "";
		}
		return new String(acc, 0, end);
	}

	private static boolean isAllDigits(String s, int start, int end) {
		for (int i = start; i < end; i++) {
			if (!Character.isDigit(s.charAt(i))) {
				return false;
			}
		}
		return true;
	}

	private static boolean isWordChar(char c) {
		return Character.isLetterOrDigit(c) || c == '_';
	}

	private static int parsePositiveIntOrMinusOne(String s, int start, int end) {
		int n = 0;
		for (int i = start; i < end; i++) {
			char c = s.charAt(i);
			if (!Character.isDigit(c)) {
				return -1;
			}
			n = (n * 10) + (c - '0');
		}
		return n;
	}
}